MPIfR-BDG / psrdada_cpp

Commit 743c795c
authored Apr 14, 2021 by Tobias Winchen

Improve speed in detect accumulate for low channel number and float output

parent e179b93f
1 changed file
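For context on the commit message: the float-output overload of detect_and_accumulate previously computed each output value with a single thread that summed naccumulate input spectra serially, so with few channels the grid-stride loop exposes relatively few independent work items. The change in the diff below splits each accumulation into nb = naccumulate / blockDim.x + 1 chunks of blockDim.x spectra, sums each chunk in a separate thread, and combines the partial sums with atomicAdd, so the iteration space grows by roughly a factor of nb. A minimal host-side sketch of that factor follows; the block size and accumulation lengths are hypothetical examples, not values taken from psrdada_cpp.

    // Sketch only: how many work items (threads) the reworked float-output kernel
    // exposes per output value. All numbers below are hypothetical examples.
    #include <cstdio>

    int main()
    {
        const int blockDimX = 256;                      // assumed block size
        const int naccs[]   = {256, 1024, 4096, 16384}; // assumed accumulation lengths

        for (int naccumulate : naccs)
        {
            // Same formula as in the kernel: each output value is split into nb
            // chunks of blockDimX input spectra, each summed by its own thread.
            const int nb = naccumulate / blockDimX + 1;
            printf("naccumulate = %5d -> nb = %3d work items per output value\n",
                   naccumulate, nb);
        }
        return 0;
    }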
psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
...
...
@@ -16,7 +16,7 @@ __global__
 void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out,
         int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
 {
     // grid stride loop over output array to keep
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
     {
         double sum = 0.0f;
...
...
@@ -30,36 +30,52 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
         double y = tmp.y * tmp.y;
         sum += x + y;
       }
       size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
       out[toff + i] += (int8_t) ((sum - offset) / scale);
+      // no atomic add for int8, thus no optimized version here. Atomic add can be
+      // implemented using an int32 atomicAdd and bit shifting, but this needs more effort.
     }
 }

 template <typename T>
 __global__
 void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out,
         int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
 {
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
+    const int nb = naccumulate / blockDim.x + 1;
+    const int bs = blockDim.x;
+    const int number_of_spectra = nsamps / (nchans * naccumulate);
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate * nb); i += blockDim.x * gridDim.x)
     {
-      double sum = 0;
-      size_t currentOutputSpectra = i / nchans;
-      size_t currentChannel = i % nchans;
+      const size_t bn = i / nchans / number_of_spectra;
+      const size_t currentOutputSpectra = i / nchans;
+      const size_t currentChannel = i % nchans;
-      for (size_t j = 0; j < naccumulate; j++)
+      double sum = 0;
+      for (size_t k = 0; k < bs; k++)
       {
+        size_t j = k + bn * bs;
+        if (j >= naccumulate)
+          break;
         float2 tmp = in[j * nchans + currentOutputSpectra * nchans * naccumulate + currentChannel];
         double x = tmp.x * tmp.x;
         double y = tmp.y * tmp.y;
         sum += x + y;
       }
       size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
-      out[i + toff] += sum;
+      atomicAdd(&out[toff + currentChannel], ((sum - offset) / scale));
     }
 }
 } // namespace kernels
...
...
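The comment added to the int8_t overload above notes that an 8-bit atomic add could in principle be emulated with a 32-bit atomic and bit shifting. As background, here is one minimal sketch of that idea; it is not psrdada_cpp code, and the helper name atomicAddInt8 and the small test harness are invented for illustration. It uses atomicCAS on the aligned 32-bit word containing the target byte rather than a plain 32-bit atomicAdd, since a plain add of a shifted value could carry into the neighbouring byte.

    // Sketch only: emulate an 8-bit atomic add via a compare-and-swap loop on the
    // aligned 32-bit word that contains the byte. Overflow wraps; a production
    // version might saturate instead.
    #include <cstdint>
    #include <cstdio>

    __device__ void atomicAddInt8(int8_t* address, int8_t val)
    {
        uintptr_t addr = reinterpret_cast<uintptr_t>(address);
        unsigned int* base = reinterpret_cast<unsigned int*>(addr & ~uintptr_t(3));
        const unsigned int shift = (addr & 3) * 8;   // byte position inside the word
        const unsigned int mask  = 0xFFu << shift;

        unsigned int old = *base, assumed;
        do
        {
            assumed = old;
            // Extract the current byte, add, and splice the result back into the word.
            int8_t current = static_cast<int8_t>((assumed & mask) >> shift);
            int8_t updated = static_cast<int8_t>(current + val);
            unsigned int replacement =
                (assumed & ~mask) | ((static_cast<unsigned int>(updated) & 0xFFu) << shift);
            old = atomicCAS(base, assumed, replacement);
        } while (old != assumed);
    }

    __global__ void test_atomic_add_int8(int8_t* out, int n)
    {
        // Every thread adds 1 to the same byte; the result should equal n (mod 256).
        if (int(blockIdx.x * blockDim.x + threadIdx.x) < n)
            atomicAddInt8(out, 1);
    }

    int main()
    {
        int8_t* d = nullptr;
        cudaMalloc(reinterpret_cast<void**>(&d), 4);
        cudaMemset(d, 0, 4);
        test_atomic_add_int8<<<1, 100>>>(d, 100);
        int8_t h[4] = {};
        cudaMemcpy(h, d, 4, cudaMemcpyDeviceToHost);
        printf("result: %d (expected 100)\n", int(h[0]));
        cudaFree(d);
        return 0;
    }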