Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
4d0b0ab1
Commit
4d0b0ab1
authored
May 29, 2019
by
Andreas Marek
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Start to implement real NEON ARCH64 kernels
parent
b844cf4d
Changes
12
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
1265 additions
and
504 deletions
+1265
-504
Doxyfile.in
Doxyfile.in
+12
-0
Makefile.am
Makefile.am
+21
-0
configure.ac
configure.ac
+32
-5
elpa/elpa_constants.h.in
elpa/elpa_constants.h.in
+7
-4
src/elpa2/compute_hh_trafo.F90
src/elpa2/compute_hh_trafo.F90
+197
-0
src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c
src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c
+642
-495
src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c
src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c
+59
-0
src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c
src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c
+59
-0
src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c
src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c
+59
-0
src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c
src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c
+59
-0
src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c
src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c
+59
-0
src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c
src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c
+59
-0
No files found.
Doxyfile.in
View file @
4d0b0ab1
...
...
@@ -908,10 +908,14 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/asm_x86_64_double_precision.s \
@top_srcdir@/src/elpa2/kernels/real_avx512_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/asm_x86_64_single_precision.s \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c \
...
...
@@ -929,19 +933,27 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_bgq.f90 \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_simple.F90 \
@top_srcdir@/src/elpa2/kernels/complex.F90 \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_bgp.f90 \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c \
@top_srcdir@/src/elpa2/elpa2_compute_complex_template.F90 \
@top_srcdir@/src/elpa2/elpa2_bandred_template.F90 \
@top_srcdir@/src/elpa2/pack_unpack_gpu.F90 \
...
...
Makefile.am
View file @
4d0b0ab1
...
...
@@ -227,6 +227,13 @@ if WITH_REAL_SPARC64_BLOCK2_KERNEL
#endif
endif
if
WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c
endif
endif
if
WITH_REAL_VSX_BLOCK2_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_vsx_2hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
...
...
@@ -269,6 +276,13 @@ if WITH_REAL_SPARC64_BLOCK4_KERNEL
#endif
endif
if
WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c
endif
endif
if
WITH_REAL_VSX_BLOCK4_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_vsx_4hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
...
...
@@ -311,6 +325,13 @@ if WITH_REAL_SPARC64_BLOCK6_KERNEL
#endif
endif
if
WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c
endif
endif
if
WITH_REAL_VSX_BLOCK6_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_vsx_6hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
...
...
configure.ac
View file @
4d0b0ab1
...
...
@@ -636,6 +636,12 @@ m4_define(elpa_m4_sparc64_kernels, [
real_sparc64_block6
])
m4_define(elpa_m4_neon_arch64_kernels, [
real_neon_arch64_block2
real_neon_arch64_block4
real_neon_arch64_block6
])
m4_define(elpa_m4_vsx_kernels, [
real_vsx_block2
real_vsx_block4
...
...
@@ -681,7 +687,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64
neon_arch64
vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
...
...
@@ -715,6 +721,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [
dnl Modify list of kernels with configure arguments
ELPA_SELECT_KERNELS([generic],[enable])
ELPA_SELECT_KERNELS([sparc64],[disable])
ELPA_SELECT_KERNELS([neon_arch64],[disable])
ELPA_SELECT_KERNELS([vsx],[disable])
ELPA_SELECT_KERNELS([sse],[enable])
ELPA_SELECT_KERNELS([sse_assembly],[enable])
...
...
@@ -730,7 +737,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_
neon_arch64_kernels elpa_m4_
vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi
...
...
@@ -790,7 +797,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
])
fi
])
m4_foreach_w([elpa_m4_arch],[sparc64 vsx sse avx avx2 avx512],[
m4_foreach_w([elpa_m4_arch],[sparc64
neon_arch64
vsx sse avx avx2 avx512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
...
...
@@ -848,7 +855,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_
neon_arch64_kernels elpa_m4_
vsx_kernels elpa_m4_generic_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
...
...
@@ -895,7 +902,6 @@ int main(int argc, char **argv) {
AC_DEFINE([HAVE_VSX_SSE],[1],[Altivec VSX intrinsics are supported on this CPU])
fi
if test x"${need_sparc64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
...
...
@@ -917,6 +923,27 @@ int main(int argc, char **argv) {
AC_DEFINE([HAVE_SPARC64_SSE],[1],[SPARC64 intrinsics are supported on this CPU])
fi
if test x"${need_neon_arch64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile NEON ARCH64 with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <arm_neon.h>
int main(int argc, char **argv) {
__Float64x2_t x1, x2, x3, x4;
x4 = vfmaq_64(x1, x2, x3);
return 0;
}
])],
[can_compile_neon_arch64=yes],
[can_compile_neon_arch64=no]
)
AC_MSG_RESULT([${can_compile_neon_arch64}])
if test x"$can_compile_neon_arch64" != x"yes"; then
AC_MSG_ERROR([Could not compile test program, try with --disable-neon_arch64, or adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_NEON_ARCH64_SSE],[1],[NEON_ARCH64 intrinsics are supported on this CPU])
fi
if test x"${need_sse}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
...
...
elpa/elpa_constants.h.in
View file @
4d0b0ab1
...
...
@@ -44,10 +44,13 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 25, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 22, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 28, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
...
...
src/elpa2/compute_hh_trafo.F90
View file @
4d0b0ab1
...
...
@@ -339,6 +339,7 @@
kernel
.eq.
ELPA_2STAGE_REAL_AVX512_BLOCK2
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_SSE_BLOCK2
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_SPARC64_BLOCK2
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_VSX_BLOCK2
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_GENERIC
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_GENERIC_SIMPLE
.or.
&
...
...
@@ -850,6 +851,43 @@
#endif /* REALCASE == 1 */
#if REALCASE == 1
! implementation of neon_arch64 block 2 real case
#if defined(WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2
)
then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL))
do
j
=
ncols
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL */
#endif /* REALCASE == 1 */
#if REALCASE == 1
! implementation of vsx block 2 real case
...
...
@@ -1656,6 +1694,77 @@
#endif /* REALCASE */
#if REALCASE == 1
! neon_arch64 block 4 real kernel
#if defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4
)
then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL))
do
j
=
ncols
,
4
,
-4
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
w
(:,
3
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-2
)
w
(:,
4
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-3
)
#ifdef WITH_OPENMP
call
quad_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_4hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-3
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
quad_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_4hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-3
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
do
jj
=
j
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jj
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jj
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#ifdef WITH_OPENMP
if
(
jj
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
cpu_openmp_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
1
+
off
+
a_off
:
1
+
off
+
a_off
+
nbw
-1
,
istripe
,
my_thread
),
&
bcast_buffer
(
1
:
nbw
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#else
if
(
jj
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
cpu_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
1
+
off
+
a_off
:
1
+
off
+
a_off
+
nbw
-1
,
istripe
),
bcast_buffer
(
1
:
nbw
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1
! vsx block4 real kernel
...
...
@@ -2040,6 +2149,94 @@
#endif /* REALCASE */
#if REALCASE == 1
!neon_arch64 block6 real kernel
#if defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6
)
then
#endif /* not WITH_FIXED_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do
j
=
ncols
,
6
,
-6
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
w
(:,
3
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-2
)
w
(:,
4
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-3
)
w
(:,
5
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-4
)
w
(:,
6
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-5
)
#ifdef WITH_OPENMP
call
hexa_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_6hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-5
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
hexa_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_6hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-5
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
do
jj
=
j
,
4
,
-4
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-1
)
w
(:,
3
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-2
)
w
(:,
4
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-3
)
#ifdef WITH_OPENMP
call
quad_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_4hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
quad_hh_trafo_
&
&
MATH_DATATYPE
&
&
neon_arch64_4hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
)),
w
,
&
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
do
jjj
=
jj
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
jjj
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
jjj
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jjj
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
neon_arch64_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jjj
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#ifdef WITH_OPENMP
if
(
jjj
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
cpu_openmp_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
1
+
off
+
a_off
:
1
+
off
+
a_off
+
nbw
-1
,
istripe
,
my_thread
),
&
bcast_buffer
(
1
:
nbw
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#else
if
(
jjj
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
cpu_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
1
+
off
+
a_off
:
1
+
off
+
a_off
+
nbw
-1
,
istripe
),
bcast_buffer
(
1
:
nbw
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#endif
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1
!vsx block6 real kernel
#if defined(WITH_REAL_VSX_BLOCK6_KERNEL)
...
...
src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c
View file @
4d0b0ab1
...
...
@@ -65,17 +65,28 @@
#define CONCAT_3ARGS(a, b, c) CONCAT2_3ARGS(a, b, c)
#define CONCAT2_3ARGS(a, b, c) a ## b ## c
//define instruction set numbers
#define NEON_ARCH64_128 1285
#if VEC_SET == 128 || VEC_SET == 256 || VEC_SET == 512
#include <x86intrin.h>
#endif
#if VEC_SET == 1281
#include <fjmfunc.h>
#include <emmintrin.h>
#endif
#if VEC_SET == 1282
#include <altivec.h>
#endif
#if VEC_SET == NEON_ARCH64_128
#include <arm_neon.h>
#endif
#include <stdio.h>
#include <stdlib.h>
...
...
@@ -106,6 +117,10 @@
#define SIMD_SET VSX
#endif
#if VEC_SET == NEON_ARCH64_128
#define SIMD_SET NEON_ARCH64
#endif
#if VEC_SET == 256
#define SIMD_SET AVX_AVX2
#endif
...
...
@@ -129,6 +144,10 @@
#if VEC_SET == 128
#define _SIMD_SET _mm_set_pd
#define _SIMD_SET1 _mm_set1_pd
#define _SIMD_NEG 1
#endif
#if VEC_SET == 1281
#define _SIMD_NEG _fjsp_neg_v2r8
#endif
#endif /* DOUBLE_PRECISION_REAL */
#ifdef SINGLE_PRECISION_REAL
...
...
@@ -143,7 +162,11 @@
#if VEC_SET == 128
#define _SIMD_SET _mm_set_ps
#define _SIMD_SET1 _mm_set1_ps
#define _SIMD_NEG 1
#endif
#if VEC_SET == 1281
#define _SIMD_NEG 1
#endif
#endif /* SINGLE_PRECISION_REAL */
#endif /* VEC_SET == 128 || VEC_SET == 1281 */
...
...
@@ -161,6 +184,7 @@
#define _SIMD_LOAD (__vector float) vec_ld
#endif
#define _SIMD_NEG 1
#define _SIMD_STORE vec_st
#define _SIMD_ADD vec_add
#define _SIMD_MUL vec_mul
...
...
@@ -168,6 +192,35 @@
#endif /* VEC_SET == 1281 */
#if VEC_SET == NEON_ARCH64_128
#ifdef DOUBLE_PRECISION_REAL
#define offset 2
#define __SIMD_DATATYPE __Float64x2_t
#define _SIMD_LOAD vld1q_f64
#define _SIMD_STORE vst1q_f64
#define _SIMD_ADD vaddq_f64
#define _SIMD_MUL vmulq_f64
#define _SIMD_SUB vsubq_f64
#define _SIMD_NEG vnegq_f64
// FMA
//#define _SIMD_XOR _mm_xor_pd
#define _SIMD_SET1 vdupq_n_f64
#endif /* DOUBLE_PRECISION_REAL */
#ifdef SINGLE_PRECISION_REAL
#define offset 4
#define __SIMD_DATATYPE __Float32x4_t
#define _SIMD_LOAD vld1q_f32
#define _SIMD_STORE vst1q_f32
#define _SIMD_ADD vaddq_f32
#define _SIMD_MUL vmulq_f32
#define _SIMD_SUB vsubq_f32
#define _SIMD_NEG vnegq_f32
//FMA
//#define _SIMD_XOR _mm_xor_ps
#define _SIMD_SET1 vdupq_n_f32
#endif /* SINGLE_PRECISION_REAL */
#endif /* VEC_SET == 128 || VEC_SET == 1281 */
#if VEC_SET == 256
#ifdef DOUBLE_PRECISION_REAL
#define offset 4
...
...
@@ -180,6 +233,7 @@
#define _SIMD_SET1 _mm256_set1_pd
#define _SIMD_XOR _mm256_xor_pd
#define _SIMD_BROADCAST _mm256_broadcast_sd
#define _SIMD_NEG 1
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
...
...
@@ -213,6 +267,7 @@
#define _SIMD_SET1 _mm256_set1_ps
#define _SIMD_XOR _mm256_xor_ps
#define _SIMD_BROADCAST _mm256_broadcast_ss
#define _SIMD_NEG 1
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
...
...
@@ -247,6 +302,7 @@
#define _SIMD_MUL _mm512_mul_pd
#define _SIMD_SUB _mm512_sub_pd
#define _SIMD_SET1 _mm512_set1_pd
#define _SIMD_NEG 1
#ifdef HAVE_AVX512_XEON
#define _SIMD_XOR _mm512_xor_pd
#endif
...
...
@@ -273,6 +329,7 @@
#define _SIMD_MUL _mm512_mul_ps
#define _SIMD_SUB _mm512_sub_ps
#define _SIMD_SET1 _mm512_set1_ps
#define _SIMD_NEG 1
#ifdef HAVE_AVX512_XEON
#define _SIMD_XOR _mm512_xor_ps
#endif
...
...
@@ -305,7 +362,7 @@
#undef __AVX__
#endif
#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET == 256 || VEC_SET == 512
#if VEC_SET == 128 || VEC_SET == 1281 || VEC_SET ==
NEON_ARCH64_128 || VEC_SET ==
256 || VEC_SET == 512
#undef _LOAD
#undef _STORE
#undef _XOR
...
...
@@ -324,7 +381,7 @@
#endif
#if VEC_SET == 128 || VEC_SET == 1281
#if VEC_SET == 128 || VEC_SET == 1281
|| VEC_SET == NEON_ARCH64_128
//Forward declaration
#ifdef DOUBLE_PRECISION_REAL
#undef ROW_LENGTH
...
...
@@ -334,7 +391,7 @@
#undef ROW_LENGTH
#define ROW_LENGTH 4
#endif
#endif /* VEC_SET == 128 || VEC_SET == 1281 */
#endif /* VEC_SET == 128 || VEC_SET == 1281
|| VEC_SET == NEON_ARCH64_128
*/
#if VEC_SET == 256
#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -368,7 +425,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
DATA_TYPE_PTR scalarprods);
#endif
#if VEC_SET == 128 || VEC_SET == 1281
#if VEC_SET == 128 || VEC_SET == 1281
|| VEC_SET == NEON_ARCH64_128
#ifdef DOUBLE_PRECISION_REAL
#undef ROW_LENGTH
#define ROW_LENGTH 4
...
...
@@ -377,7 +434,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#undef ROW_LENGTH
#define ROW_LENGTH 8
#endif
#endif /* VEC_SET == 128 || VEC_SET == 1281 */
#endif /* VEC_SET == 128 || VEC_SET == 1281
|| VEC_SET == NEON_ARCH64_128
*/
#if VEC_SET == 256
#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -411,7 +468,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
DATA_TYPE_PTR scalarprods);
#endif
#if VEC_SET == 128 || VEC_SET == 1281
#if VEC_SET == 128 || VEC_SET == 1281
|| VEC_SET == NEON_ARCH64_128
#ifdef DOUBLE_PRECISION_REAL
#undef ROW_LENGTH
#define ROW_LENGTH 6
...
...
@@ -420,7 +477,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#undef ROW_LENGTH
#define ROW_LENGTH 12
#endif
#endif /* VEC_SET == 128 || VEC_SET == 1281 */
#endif /* VEC_SET == 128 || VEC_SET == 1281
|| VEC_SET == NEON_ARCH64_128
*/
#if VEC_SET == 256
#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -455,7 +512,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
DATA_TYPE_PTR scalarprods);
#endif
#if VEC_SET == 128 || VEC_SET == 1281
#if VEC_SET == 128 || VEC_SET == 1281
|| VEC_SET == NEON_ARCH64_128
#ifdef DOUBLE_PRECISION_REAL
#undef ROW_LENGTH
#define ROW_LENGTH 8
...
...
@@ -464,7 +521,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#undef ROW_LENGTH
#define ROW_LENGTH 16
#endif
#endif /* VEC_SET == 128 || VEC_SET == 1281 */
#endif /* VEC_SET == 128 || VEC_SET == 1281
|| VEC_SET == NEON_ARCH64_128
*/
#if VEC_SET == 256
#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -499,7 +556,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
DATA_TYPE_PTR scalarprods);
#endif
#if VEC_SET == 128 || VEC_SET == 1281
#if VEC_SET == 128 || VEC_SET == 1281
|| VEC_SET == NEON_ARCH64_128
#ifdef DOUBLE_PRECISION_REAL
#undef ROW_LENGTH
#define ROW_LENGTH 10
...
...
@@ -508,7 +565,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#undef ROW_LENGTH
#define ROW_LENGTH 20
#endif
#endif /* VEC_SET == 128 || VEC_SET == 1281 */
#endif /* VEC_SET == 128 || VEC_SET == 1281
|| VEC_SET == NEON_ARCH64_128
*/
#if VEC_SET == 256
#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -543,7 +600,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
DATA_TYPE_PTR scalarprods);
#endif
#if VEC_SET == 128 || VEC_SET == 1281
#if VEC_SET == 128 || VEC_SET == 1281
|| VEC_SET == NEON_ARCH64_128
#ifdef DOUBLE_PRECISION_REAL