Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
MPIfR-BDG
psrdada_cpp
Commits
64d92803
Commit
64d92803
authored
Feb 05, 2021
by
Tobias Winchen
Browse files
Merge branch 'devel' into 'devel'
Added 10 it unpacker on GPU See merge request
!4
parents
b3619f8d
f9ab54d5
Pipeline
#92900
failed with stages
in 5 minutes and 36 seconds
Changes
6
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
psrdada_cpp/effelsberg/edd/Unpacker.cuh
View file @
64d92803
...
...
@@ -12,6 +12,9 @@ namespace kernels {
__global__
void
unpack_edd_12bit_to_float32
(
uint64_t
const
*
__restrict__
in
,
float
*
__restrict__
out
,
int
n
);
__global__
void
unpack_edd_10bit_to_float32
(
uint64_t
const
*
__restrict__
in
,
float
*
__restrict__
out
,
int
n
);
__global__
void
unpack_edd_8bit_to_float32
(
uint64_t
const
*
__restrict__
in
,
float
*
__restrict__
out
,
int
n
);
...
...
@@ -33,7 +36,7 @@ public:
void
unpack
(
const
uint64_t
*
input
,
float
*
output
,
size_t
size
);
template
<
int
Nbits
>
void
unpack
(
InputType
const
&
input
,
OutputType
&
output
)
void
unpack
(
InputType
const
&
input
,
OutputType
&
output
)
{
InputType
::
value_type
const
*
input_ptr
=
thrust
::
raw_pointer_cast
(
input
.
data
());
OutputType
::
value_type
*
output_ptr
=
thrust
::
raw_pointer_cast
(
output
.
data
());
...
...
psrdada_cpp/effelsberg/edd/detail/GatedSpectrometer.cu
View file @
64d92803
...
...
@@ -75,7 +75,7 @@ GatedSpectrometer<HandlerType, InputType, OutputType>::GatedSpectrometer(
{
// Sanity checks
assert
(((
nbits
==
12
)
||
(
nbits
==
8
)));
assert
(((
nbits
==
12
)
||
(
nbits
==
8
)
||
(
nbits
==
10
)
));
assert
(
_naccumulate
>
0
);
// check for any device errors
...
...
psrdada_cpp/effelsberg/edd/detail/GatedStokesSpectrometer.cu
View file @
64d92803
...
...
@@ -160,7 +160,7 @@ GatedStokesSpectrometer<HandlerType>::GatedStokesSpectrometer(
_call_count
(
0
),
_nsamps_per_heap
(
4096
),
_processing_efficiency
(
0.
){
// Sanity checks
assert
(((
_nbits
==
12
)
||
(
_nbits
==
8
)));
assert
(((
_nbits
==
12
)
||
(
_nbits
==
8
)
||
(
_nbits
==
10
)
));
assert
(
_naccumulate
>
0
);
// check for any device errors
...
...
psrdada_cpp/effelsberg/edd/src/GatedSpectrometer_cli.cu
View file @
64d92803
...
...
@@ -198,7 +198,7 @@ int main(int argc, char **argv) {
desc
.
add_options
()(
"nbits,b"
,
po
::
value
<
unsigned
int
>
(
&
ip
.
nbits
)
->
required
(),
"The number of bits per sample in the "
"packetiser output (8
or
12)"
);
"packetiser output (8
, 10
12)"
);
desc
.
add_options
()(
"fft_length,n"
,
po
::
value
<
size_t
>
(
&
ip
.
fft_length
)
->
required
(),
"The length of the FFT to perform on the data"
);
desc
.
add_options
()(
"naccumulate,a"
,
...
...
psrdada_cpp/effelsberg/edd/src/Unpacker.cu
View file @
64d92803
...
...
@@ -2,6 +2,7 @@
#include "psrdada_cpp/cuda_utils.hpp"
#define EDD_NTHREADS_UNPACK 512
#define EDD_NTHREADS_UNPACK10 128 // More than 128 threads are silently not launched??
namespace
psrdada_cpp
{
namespace
effelsberg
{
...
...
@@ -21,6 +22,93 @@ __device__ __forceinline__ uint64_t swap64(uint64_t x)
return
result
;
}
__global__
void
unpack_edd_10bit_to_float32
(
uint64_t
const
*
__restrict__
in
,
float
*
__restrict__
out
,
int
n
)
{
/**
* Note: This kernels will not work with more than 256 threads.
*/
__shared__
volatile
float
tmp_out
[
EDD_NTHREADS_UNPACK10
*
32
];
__shared__
volatile
uint64_t
tmp_in
[
EDD_NTHREADS_UNPACK10
*
5
];
int
block_idx
=
blockIdx
.
x
;
uint64_t
val
;
uint64_t
rest
;
volatile
float
*
sout
=
tmp_out
+
(
32
*
threadIdx
.
x
);
for
(
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
(
5
*
idx
+
4
)
<
n
;
idx
+=
gridDim
.
x
*
blockDim
.
x
)
{
//Read to shared memeory
int
block_read_start
=
block_idx
*
EDD_NTHREADS_UNPACK10
*
5
;
tmp_in
[
threadIdx
.
x
]
=
in
[
block_read_start
+
threadIdx
.
x
];
tmp_in
[
EDD_NTHREADS_UNPACK10
+
threadIdx
.
x
]
=
in
[
block_read_start
+
EDD_NTHREADS_UNPACK10
+
threadIdx
.
x
];
tmp_in
[
EDD_NTHREADS_UNPACK10
*
2
+
threadIdx
.
x
]
=
in
[
block_read_start
+
EDD_NTHREADS_UNPACK10
*
2
+
threadIdx
.
x
];
tmp_in
[
EDD_NTHREADS_UNPACK10
*
3
+
threadIdx
.
x
]
=
in
[
block_read_start
+
EDD_NTHREADS_UNPACK10
*
3
+
threadIdx
.
x
];
tmp_in
[
EDD_NTHREADS_UNPACK10
*
4
+
threadIdx
.
x
]
=
in
[
block_read_start
+
EDD_NTHREADS_UNPACK10
*
4
+
threadIdx
.
x
];
__syncthreads
();
val
=
swap64
(
tmp_in
[
5
*
threadIdx
.
x
]);
sout
[
0
]
=
(
float
)((
int64_t
)((
0xFFC0000000000000
&
val
)
<<
0
)
>>
54
);
sout
[
1
]
=
(
float
)((
int64_t
)((
0x003FF00000000000
&
val
)
<<
10
)
>>
54
);
sout
[
2
]
=
(
float
)((
int64_t
)((
0x00000FFC00000000
&
val
)
<<
20
)
>>
54
);
sout
[
3
]
=
(
float
)((
int64_t
)((
0x00000003FF000000
&
val
)
<<
30
)
>>
54
);
sout
[
4
]
=
(
float
)((
int64_t
)((
0x0000000000FFC000
&
val
)
<<
40
)
>>
54
);
sout
[
5
]
=
(
float
)((
int64_t
)((
0x0000000000003FF0
&
val
)
<<
50
)
>>
54
);
rest
=
(
0x000000000000000F
&
val
)
<<
60
;
val
=
swap64
(
tmp_in
[
5
*
threadIdx
.
x
+
1
]);
sout
[
6
]
=
(
float
)((
int64_t
)(((
0xFC00000000000000
&
val
)
>>
4
)
|
rest
)
>>
54
);
sout
[
7
]
=
(
float
)((
int64_t
)((
0x03FF000000000000
&
val
)
<<
6
)
>>
54
);
sout
[
8
]
=
(
float
)((
int64_t
)((
0x0000FFC000000000
&
val
)
<<
16
)
>>
54
);
sout
[
9
]
=
(
float
)((
int64_t
)((
0x0000003FF0000000
&
val
)
<<
26
)
>>
54
);
sout
[
10
]
=
(
float
)((
int64_t
)((
0x000000000FFC0000
&
val
)
<<
36
)
>>
54
);
sout
[
11
]
=
(
float
)((
int64_t
)((
0x000000000003FF00
&
val
)
<<
46
)
>>
54
);
rest
=
(
0x00000000000000FF
&
val
)
<<
56
;
val
=
swap64
(
tmp_in
[
5
*
threadIdx
.
x
+
2
]);
sout
[
12
]
=
(
float
)((
int64_t
)(((
0xC000000000000000
&
val
)
>>
8
)
|
rest
)
>>
54
);
sout
[
13
]
=
(
float
)((
int64_t
)((
0x3FF0000000000000
&
val
)
<<
2
)
>>
54
);
sout
[
14
]
=
(
float
)((
int64_t
)((
0x000FFC0000000000
&
val
)
<<
12
)
>>
54
);
sout
[
15
]
=
(
float
)((
int64_t
)((
0x000003FF00000000
&
val
)
<<
22
)
>>
54
);
sout
[
16
]
=
(
float
)((
int64_t
)((
0x00000000FFC00000
&
val
)
<<
32
)
>>
54
);
sout
[
17
]
=
(
float
)((
int64_t
)((
0x00000000003FF000
&
val
)
<<
42
)
>>
54
);
sout
[
18
]
=
(
float
)((
int64_t
)((
0x0000000000000FFC
&
val
)
<<
52
)
>>
54
);
rest
=
(
0x0000000000000003
&
val
)
<<
62
;
val
=
swap64
(
tmp_in
[
5
*
threadIdx
.
x
+
3
]);
sout
[
19
]
=
(
float
)((
int64_t
)(((
0xFF00000000000000
&
val
)
>>
2
)
|
rest
)
>>
54
);
sout
[
20
]
=
(
float
)((
int64_t
)((
0x00FFC00000000000
&
val
)
<<
8
)
>>
54
);
sout
[
21
]
=
(
float
)((
int64_t
)((
0x00003FF000000000
&
val
)
<<
18
)
>>
54
);
sout
[
22
]
=
(
float
)((
int64_t
)((
0x0000000FFC000000
&
val
)
<<
28
)
>>
54
);
sout
[
23
]
=
(
float
)((
int64_t
)((
0x0000000003FF0000
&
val
)
<<
38
)
>>
54
);
sout
[
24
]
=
(
float
)((
int64_t
)((
0x000000000000FFC0
&
val
)
<<
48
)
>>
54
);
rest
=
(
0x000000000000003F
&
val
)
<<
58
;
val
=
swap64
(
tmp_in
[
5
*
threadIdx
.
x
+
4
]);
sout
[
25
]
=
(
float
)((
int64_t
)(((
0xF000000000000000
&
val
)
>>
6
)
|
rest
)
>>
54
);
sout
[
26
]
=
(
float
)((
int64_t
)((
0x0FFC000000000000
&
val
)
<<
4
)
>>
54
);
sout
[
27
]
=
(
float
)((
int64_t
)((
0x0003FF0000000000
&
val
)
<<
14
)
>>
54
);
sout
[
28
]
=
(
float
)((
int64_t
)((
0x000000FFC0000000
&
val
)
<<
24
)
>>
54
);
sout
[
29
]
=
(
float
)((
int64_t
)((
0x000000003FF00000
&
val
)
<<
34
)
>>
54
);
sout
[
30
]
=
(
float
)((
int64_t
)((
0x00000000000FFC00
&
val
)
<<
44
)
>>
54
);
sout
[
31
]
=
(
float
)((
int64_t
)((
0x00000000000003FF
&
val
)
<<
54
)
>>
54
);
rest
=
0
;
__syncthreads
();
size_t
block_write_start
=
block_idx
*
EDD_NTHREADS_UNPACK10
*
32
;
for
(
size_t
ii
=
threadIdx
.
x
;
ii
<
32
*
EDD_NTHREADS_UNPACK10
;
ii
+=
blockDim
.
x
)
{
out
[
block_write_start
+
ii
]
=
tmp_out
[
ii
];
}
block_idx
+=
gridDim
.
x
;
__syncthreads
();
}
}
__global__
void
unpack_edd_12bit_to_float32
(
uint64_t
const
*
__restrict__
in
,
float
*
__restrict__
out
,
int
n
)
{
...
...
@@ -139,6 +227,17 @@ void Unpacker::unpack<8>(const uint64_t* input, float* output, size_t size)
input
,
output
,
size
);
}
template
<
>
void
Unpacker
::
unpack
<
10
>
(
const
uint64_t
*
input
,
float
*
output
,
size_t
size
)
{
BOOST_LOG_TRIVIAL
(
debug
)
<<
"Unpacking 10-bit data"
;
int
nblocks
=
size
/
EDD_NTHREADS_UNPACK10
;
kernels
::
unpack_edd_10bit_to_float32
<<<
nblocks
,
EDD_NTHREADS_UNPACK10
,
0
,
_stream
>>>
(
input
,
output
,
size
);
}
}
//namespace edd
}
//namespace effelsberg
}
//namespace psrdada_cpp
psrdada_cpp/effelsberg/edd/test/src/UnpackerTester.cu
View file @
64d92803
...
...
@@ -110,7 +110,7 @@ void UnpackerTester::unpacker_10_to_32_c_reference(
static_cast
<
int64_t
>
((
0x0000000000FFC000
&
val
)
<<
40
)
>>
54
));
output
.
push_back
(
static_cast
<
float
>
(
static_cast
<
int64_t
>
((
0x0000000000003FF0
&
val
)
<<
50
)
>>
54
));
rest
=
(
0x000000000000000F
&
val
)
<<
60
;
// 4 bits rest.
rest
=
(
0x000000000000000F
&
val
)
<<
60
;
// 4 bits rest.
val
=
be64toh
(
input
[
ii
+
1
]);
output
.
push_back
(
static_cast
<
float
>
(
static_cast
<
int64_t
>
(((
0xFC00000000000000
&
val
)
>>
4
)
|
rest
)
>>
54
));
...
...
@@ -205,10 +205,11 @@ void UnpackerTester::compare_against_host(
OutputType
const
&
host_output
)
{
OutputType
copy_from_gpu
=
gpu_output
;
cudaDeviceSynchronize
();
ASSERT_EQ
(
host_output
.
size
(),
copy_from_gpu
.
size
());
for
(
std
::
size_t
ii
=
0
;
ii
<
host_output
.
size
();
++
ii
)
{
ASSER
T_EQ
(
host_output
[
ii
],
copy_from_gpu
[
ii
]);
EXPEC
T_EQ
(
host_output
[
ii
],
copy_from_gpu
[
ii
])
<<
" ii = "
<<
ii
;
}
}
...
...
@@ -240,7 +241,7 @@ TEST_F(UnpackerTester, 8_bit_unpack_test)
InputType
host_input
(
n
);
for
(
size_t
ii
=
0
;
ii
<
n
;
++
ii
)
{
host_input
[
ii
]
=
distribution
(
generator
);
host_input
[
ii
]
=
distribution
(
generator
);
}
Unpacker
::
InputType
gpu_input
=
host_input
;
Unpacker
::
OutputType
gpu_output
;
...
...
@@ -253,6 +254,29 @@ TEST_F(UnpackerTester, 8_bit_unpack_test)
}
TEST_F
(
UnpackerTester
,
10
_bit_unpack_test
)
{
std
::
size_t
n
=
640
;
std
::
default_random_engine
generator
;
std
::
uniform_int_distribution
<
int
>
distribution
(
1
,
1
<<
31
);
InputType
host_input
(
n
);
for
(
size_t
ii
=
0
;
ii
<
n
;
++
ii
)
{
host_input
[
ii
]
=
distribution
(
generator
);
}
Unpacker
::
InputType
gpu_input
=
host_input
;
Unpacker
::
OutputType
gpu_output
;
gpu_output
.
resize
(
host_input
.
size
()
*
sizeof
(
host_input
[
0
])
*
8
/
10
);
OutputType
host_output
;
Unpacker
unpacker
(
_stream
);
unpacker
.
unpack
<
10
>
(
gpu_input
,
gpu_output
);
unpacker_10_to_32_c_reference
(
host_input
,
host_output
);
compare_against_host
(
gpu_output
,
host_output
);
}
}
//namespace test
}
//namespace edd
}
//namespace meerkat
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment