Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
40b14bce
Commit
40b14bce
authored
Nov 20, 2020
by
Andreas Marek
Browse files
Make SVE512 kernels known in ELPA
parent
95597586
Changes
7
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
configure.ac
View file @
40b14bce
...
...
@@ -825,12 +825,12 @@ m4_define(elpa_m4_avx512_kernels, [
complex_avx512_block2
])
m4_define(elpa_m4_vse512_kernels, [
real_vse512_block2
real_vse512_block4
real_vse512_block6
complex_vse512_block1
complex_vse512_block2
m4_define(elpa_m4_sve512_kernels, [
real_sve512_block2
real_sve512_block4
real_sve512_block6
complex_sve512_block1
complex_sve512_block2
])
m4_define(elpa_m4_bgp_kernels, [
...
...
@@ -848,7 +848,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 vse512 bgp bgq gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 sve512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
...
...
@@ -889,7 +889,7 @@ ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable])
ELPA_SELECT_KERNELS([avx2],[enable])
ELPA_SELECT_KERNELS([avx512],[enable])
ELPA_SELECT_KERNELS([vse512],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
...
...
@@ -899,7 +899,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels elpa_m4_vse512_kernels, [
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels elpa_m4_sve512_kernels, [
if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi
...
...
@@ -959,7 +959,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
])
fi
])
m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512 vse512],[
m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512 sve512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
...
...
@@ -1017,7 +1017,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_vse512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
...
...
@@ -1280,6 +1280,8 @@ if test x"${need_avx512}" = x"yes"; then
)
AC_MSG_RESULT([${can_compile_avx512_xeon}])
# Shell assignments take no blanks around the equals sign; with blanks the
# shell would try to run a command named can_compile_sve512_xeon instead.
can_compile_sve512_xeon="no"
AC_MSG_CHECKING([whether we compile for Xeon PHI])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
...
...
elpa/elpa_constants.h.in
View file @
40b14bce
...
...
@@ -60,9 +60,9 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSE512_BLOCK2, 28, @ELPA_2STAGE_REAL_VSE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSE512_BLOCK4, 29, @ELPA_2STAGE_REAL_VSE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSE512_BLOCK6, 30, @ELPA_2STAGE_REAL_VSE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 28, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 29, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 30, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 31, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 32, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
...
...
@@ -90,7 +90,9 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK1, 14, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 15, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 16, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
...
...
elpa/elpa_simd_constants.h
View file @
40b14bce
...
...
@@ -9,6 +9,6 @@
#define VSX_INSTR 9
#define ARCH64_INSTR 10
#define SPARC_INSTR 11
#define VSE512_INSTR 12
#define SVE512_INSTR 12
#define NUMBER_OF_INSTR 13
src/elpa2/compute_hh_trafo.F90
View file @
40b14bce
...
...
@@ -316,6 +316,7 @@ kernel)
kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
...
...
@@ -779,6 +780,7 @@ kernel)
#if REALCASE == 1
! no avx512 block1 real kernel
! no sve512 block1 real kernel
#endif /* REALCASE */
#if COMPLEXCASE == 1
...
...
@@ -812,6 +814,37 @@ kernel)
endif
! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1))
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL */
! sve512 block1 complex kernel
#if defined(WITH_COMPLEX_SVE512_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if
((
kernel
.eq.
ELPA_2STAGE_COMPLEX_SVE512_BLOCK1
))
then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE512_BLOCK2_KERNEL) )
ttt
=
mpi_wtime
()
do
j
=
ncols
,
1
,
-1
#ifdef WITH_OPENMP_TRADITIONAL
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#else
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE512_BLOCK2_KERNEL) ) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif
! ((kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1))
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SVE512_BLOCK1_KERNEL */
#endif /* COMPLEXCASE */
#if REALCASE == 1
...
...
@@ -1295,6 +1328,43 @@ kernel)
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_AVX512_BLOCK2_KERNEL */
! implementation of sve512 block 2 real case
#if defined(WITH_REAL_SVE512_BLOCK2_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if
((
kernel
.eq.
ELPA_2STAGE_REAL_SVE512_BLOCK2
))
then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SVE512_BLOCK6_KERNEL) && !defined(WITH_REAL_SVE512_BLOCK4_KERNEL))
do
j
=
ncols
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
#ifdef WITH_OPENMP_TRADITIONAL
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) ... */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SVE512_BLOCK2_KERNEL */
#endif /* REALCASE */
#if COMPLEXCASE == 1
...
...
@@ -1341,6 +1411,50 @@ kernel)
endif
! ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2))
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX512_BLOCK2_KERNEL */
! implementation of sve512 block 2 complex case
#if defined(WITH_COMPLEX_SVE512_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if
(
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_SVE512_BLOCK2
))
then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt
=
mpi_wtime
()
do
j
=
ncols
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
#ifdef WITH_OPENMP_TRADITIONAL
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#ifdef WITH_OPENMP_TRADITIONAL
if
(
j
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
1
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#else
if
(
j
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
1
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif
! ( (kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2))
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SVE512_BLOCK2_KERNEL */
#endif /* COMPLEXCASE */
...
...
@@ -2191,10 +2305,80 @@ kernel)
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_AVX512_BLOCK4_KERNEL */
! sve512 block4 real kernel
#if defined(WITH_REAL_SVE512_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_REAL_SVE512_BLOCK4
)
then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SVE512_BLOCK6_KERNEL))
! SVE512 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do
j
=
ncols
,
4
,
-4
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
w
(:,
3
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-2
)
w
(:,
4
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-3
)
#ifdef WITH_OPENMP_TRADITIONAL
call
quad_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_4hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-3
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
quad_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_4hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-3
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
do
jj
=
j
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-1
)
#ifdef WITH_OPENMP_TRADITIONAL
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jj
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jj
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#ifdef WITH_OPENMP_TRADITIONAL
if
(
jj
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
cpu_openmp_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
1
+
off
+
a_off
:
1
+
off
+
a_off
+
nbw
-1
,
istripe
,
my_thread
),
&
bcast_buffer
(
1
:
nbw
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#else
if
(
jj
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
cpu_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
1
+
off
+
a_off
:
1
+
off
+
a_off
+
nbw
-1
,
istripe
),
&
bcast_buffer
(
1
:
nbw
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SVE512_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SVE512_BLOCK4_KERNEL */
#endif /* REALCASE */
#if COMPLEXCASE == 1
!no avx512 block4 complex kernel
!no sve512 block4 complex kernel
#endif /* COMPLEXCASE */
...
...
@@ -2812,10 +2996,95 @@ kernel)
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_AVX512_BLOCK6_KERNEL */
! sve512 block6 kernel
#if defined(WITH_REAL_SVE512_BLOCK6_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if
((
kernel
.eq.
ELPA_2STAGE_REAL_SVE512_BLOCK6
))
then
#endif /* not WITH_FIXED_REAL_KERNEL */
! SVE512 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do
j
=
ncols
,
6
,
-6
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
w
(:,
3
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-2
)
w
(:,
4
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-3
)
w
(:,
5
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-4
)
w
(:,
6
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-5
)
#ifdef WITH_OPENMP_TRADITIONAL
call
hexa_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_6hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-5
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
hexa_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_6hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-5
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
do
jj
=
j
,
4
,
-4
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-1
)
w
(:,
3
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-2
)
w
(:,
4
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-3
)
#ifdef WITH_OPENMP_TRADITIONAL
call
quad_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_4hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
quad_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_4hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
do
jjj
=
jj
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
jjj
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
jjj
+
off
-1
)
#ifdef WITH_OPENMP_TRADITIONAL
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jjj
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sve512_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jjj
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#ifdef WITH_OPENMP_TRADITIONAL
if
(
jjj
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
cpu_openmp_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
1
+
off
+
a_off
:
1
+
off
+
a_off
+
nbw
-1
,
istripe
,
my_thread
),
&
bcast_buffer
(
1
:
nbw
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#else
if
(
jjj
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
cpu_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
1
+
off
+
a_off
:
1
+
off
+
a_off
+
nbw
-1
,
istripe
),
bcast_buffer
(
1
:
nbw
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#endif
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SVE512_BLOCK6_KERNEL */
#endif /* REALCASE */
#if COMPLEXCASE == 1
!no avx512 block6 complex kernel
!no sve512 block6 complex kernel
#endif /* COMPLEXCASE */
if
(
wantDebug
)
then
...
...
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90
View file @
40b14bce
...
...
@@ -319,7 +319,11 @@ subroutine trans_ev_tridi_to_band_&
#ifdef DOUBLE_PRECISION_REAL
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
) then
stripe_width = ((stripe_width+7)/8)*8
! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
! (8 * sizeof(double) == 64)
...
...
@@ -331,7 +335,11 @@ subroutine trans_ev_tridi_to_band_&
#else
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
) then
stripe_width = ((stripe_width+15)/16)*16
! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
...
...
@@ -347,7 +355,10 @@ subroutine trans_ev_tridi_to_band_&
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2 &
) then
stripe_width = ((stripe_width+7)/8)*8
! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
! (4 * sizeof(double complex) == 64)
...
...
@@ -360,7 +371,10 @@ subroutine trans_ev_tridi_to_band_&
#else
if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2 &
) then
stripe_width = ((stripe_width+7)/8)*8
! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
! (8 * sizeof(float complex) == 64)
...
...
@@ -424,7 +438,11 @@ subroutine trans_ev_tridi_to_band_&
#ifdef DOUBLE_PRECISION_REAL
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
) then
stripe_width = ((stripe_width+7)/8)*8
! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
! (8 * sizeof(double) == 64)
...
...
@@ -436,7 +454,11 @@ subroutine trans_ev_tridi_to_band_&
#else
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
) then
stripe_width = ((stripe_width+15)/16)*16
! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
...
...
@@ -453,7 +475,10 @@ subroutine trans_ev_tridi_to_band_&
#ifdef DOUBLE_PRECISION_COMPLEX
if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2 &
) then
stripe_width = ((stripe_width+7)/8)*8
! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
! (4 * sizeof(double complex) == 64)
...
...
@@ -466,7 +491,10 @@ subroutine trans_ev_tridi_to_band_&
#else
if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2 &
) then
stripe_width = ((stripe_width+15)/16)*16
! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
! (8 * sizeof(float complex) == 64)
...
...
src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c
View file @
40b14bce
...
...
@@ -904,6 +904,19 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
!f>#endif
*/
/*
!f>#if defined(HAVE_SVE512)
!f> interface
!f> subroutine double_hh_trafo_real_SVE512_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_real_SVE512_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine double_hh_trafo_real_AVX512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
...
...
@@ -916,6 +929,19 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_SVE512)
!f> interface
!f> subroutine double_hh_trafo_real_SVE512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_real_SVE512_2hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SSE_INTRINSICS
...
...
@@ -1097,6 +1123,19 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_SVE512)
!f> interface
!f> subroutine quad_hh_trafo_real_SVE512_4hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="quad_hh_trafo_real_SVE512_4hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX512)
...
...
@@ -1111,6 +1150,20 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_SVE512)
!f> interface
!f> subroutine quad_hh_trafo_real_SVE512_4hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="quad_hh_trafo_real_SVE512_4hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
...
...
@@ -1287,6 +1340,19 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA