Commit 40b14bce authored by Andreas Marek

Make SVE512 kernels known in ELPA

parent 95597586
@@ -825,12 +825,12 @@ m4_define(elpa_m4_avx512_kernels, [
complex_avx512_block2
])
-m4_define(elpa_m4_vse512_kernels, [
-real_vse512_block2
-real_vse512_block4
-real_vse512_block6
-complex_vse512_block1
-complex_vse512_block2
+m4_define(elpa_m4_sve512_kernels, [
+real_sve512_block2
+real_sve512_block4
+real_sve512_block6
+complex_sve512_block1
+complex_sve512_block2
])
m4_define(elpa_m4_bgp_kernels, [
@@ -848,7 +848,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu
])
-m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 vse512 bgp bgq gpu])
+m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 sve512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
@@ -889,7 +889,7 @@ ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable])
ELPA_SELECT_KERNELS([avx2],[enable])
ELPA_SELECT_KERNELS([avx512],[enable])
-ELPA_SELECT_KERNELS([vse512],[disable])
+ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
@@ -899,7 +899,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
-m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels elpa_m4_vse512_kernels, [
+m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels elpa_m4_sve512_kernels, [
if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option" echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi fi
@@ -959,7 +959,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
])
fi
])
-m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512 vse512],[
+m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512 sve512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
@@ -1017,7 +1017,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
-elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_vse512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
+elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
@@ -1280,6 +1280,8 @@ if test x"${need_avx512}" = x"yes"; then
)
AC_MSG_RESULT([${can_compile_avx512_xeon}])
+can_compile_sve512_xeon="no"
AC_MSG_CHECKING([whether we compile for Xeon PHI])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
...
@@ -60,9 +60,9 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
-X(ELPA_2STAGE_REAL_VSE512_BLOCK2, 28, @ELPA_2STAGE_REAL_VSE512_BLOCK2_COMPILED@, __VA_ARGS__) \
-X(ELPA_2STAGE_REAL_VSE512_BLOCK4, 29, @ELPA_2STAGE_REAL_VSE512_BLOCK4_COMPILED@, __VA_ARGS__) \
-X(ELPA_2STAGE_REAL_VSE512_BLOCK6, 30, @ELPA_2STAGE_REAL_VSE512_BLOCK6_COMPILED@, __VA_ARGS__) \
+X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 28, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
+X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 29, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
+X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 30, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 31, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 32, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
@@ -90,7 +90,9 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
-X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
+X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK1, 14, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK1_COMPILED@, __VA_ARGS__) \
+X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 15, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
+X(ELPA_2STAGE_COMPLEX_GPU, 16, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
...
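The X(...) lines above are ELPA's X-macro kernel table: each kernel appears once with its symbolic name, a stable numeric ID, and an @..._COMPILED@ flag substituted by configure, and the same list is expanded under different definitions of X wherever an enum or lookup table is needed. Note that the GPU entry moves from ID 14 to 16 so the two new SVE512 complex kernels can take 14 and 15. A minimal, self-contained sketch of the pattern (the three-entry list and helpers below are illustrative, not ELPA's actual table):

    #include <stdio.h>

    /* Hypothetical kernel list in the X-macro style used above. */
    #define FOR_ALL_KERNELS(X) \
      X(KERNEL_AVX512_BLOCK2, 13) \
      X(KERNEL_SVE512_BLOCK1, 14) \
      X(KERNEL_SVE512_BLOCK2, 15)

    /* Expansion 1: an enum with fixed numeric IDs. */
    #define AS_ENUM(name, id) name = id,
    enum kernels { FOR_ALL_KERNELS(AS_ENUM) };

    /* Expansion 2: a name lookup, reusing the very same list. */
    #define AS_CASE(name, id) case id: return #name;
    static const char *kernel_name(int id) {
      switch (id) { FOR_ALL_KERNELS(AS_CASE) default: return "unknown"; }
    }

    int main(void) {
      printf("%d -> %s\n", KERNEL_SVE512_BLOCK1, kernel_name(14));
      return 0;
    }

Keeping the IDs explicit in the list is what makes renumbering collisions (like the GPU entry's move to 16) visible in a one-line diff.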
@@ -9,6 +9,6 @@
#define VSX_INSTR 9
#define ARCH64_INSTR 10
#define SPARC_INSTR 11
-#define VSE512_INSTR 12
+#define SVE512_INSTR 12
#define NUMBER_OF_INSTR 13
@@ -316,6 +316,7 @@ kernel)
kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
+kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
@@ -779,6 +780,7 @@ kernel)
#if REALCASE == 1
! no avx512 block1 real kernel
+! no sve512 block1 real kernel
#endif /* REALCASE */
#if COMPLEXCASE == 1
@@ -812,6 +814,37 @@ kernel)
endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1))
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL */
+! sve512 block1 complex kernel
+#if defined(WITH_COMPLEX_SVE512_BLOCK1_KERNEL)
+#ifndef WITH_FIXED_COMPLEX_KERNEL
+if ((kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1)) then
+#endif /* not WITH_FIXED_COMPLEX_KERNEL */
+#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE512_BLOCK2_KERNEL))
+ttt = mpi_wtime()
+do j = ncols, 1, -1
+#ifdef WITH_OPENMP_TRADITIONAL
+call single_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_1hv_&
+&PRECISION&
+& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off), nbw, nl, stripe_width)
+#else
+call single_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_1hv_&
+&PRECISION&
+& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off), nbw, nl, stripe_width)
+#endif
+enddo
+#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE512_BLOCK2_KERNEL)) */
+#ifndef WITH_FIXED_COMPLEX_KERNEL
+endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1))
+#endif /* not WITH_FIXED_COMPLEX_KERNEL */
+#endif /* WITH_COMPLEX_SVE512_BLOCK1_KERNEL */
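This Fortran dispatch calls a C kernel through ISO C binding: c_loc passes the stripe's base address, and the scalar extents arrive by reference. The commit does not show the C side; under that binding convention the double-precision complex entry point would plausibly look like the prototype below, with the parameter names being my guesses rather than anything taken from the diff:

    #include <complex.h>

    /* Hedged sketch of the C-side interface for the SVE512 1hv complex
     * kernel, inferred from the Fortran call site above. Fortran passes
     * scalars by reference, hence the int pointers. */
    void single_hh_trafo_complex_sve512_1hv_double(
        double complex *q,   /* c_loc(a(1,j+off+a_off,istripe)) */
        double complex *hh,  /* bcast_buffer(1,j+off): Householder vector */
        int *pnb,            /* nbw: bandwidth / Householder vector length */
        int *pnq,            /* nl: number of rows in this stripe */
        int *pldq);          /* stripe_width: leading dimension of q */

The 2hv kernels called further down carry one extra integer (the diff passes nbw twice), presumably the leading dimension of the Householder-vector block.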
#endif /* COMPLEXCASE */
#if REALCASE == 1
@@ -1295,6 +1328,43 @@ kernel)
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_AVX512_BLOCK2_KERNEL */
+! implementation of sve512 block 2 real case
+#if defined(WITH_REAL_SVE512_BLOCK2_KERNEL)
+#ifndef WITH_FIXED_REAL_KERNEL
+if ((kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2)) then
+#endif /* not WITH_FIXED_REAL_KERNEL */
+#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SVE512_BLOCK6_KERNEL) && !defined(WITH_REAL_SVE512_BLOCK4_KERNEL))
+do j = ncols, 2, -2
+w(:,1) = bcast_buffer(1:nbw,j+off)
+w(:,2) = bcast_buffer(1:nbw,j+off-1)
+#ifdef WITH_OPENMP_TRADITIONAL
+call double_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_2hv_&
+&PRECISION&
+& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
+#else
+call double_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_2hv_&
+&PRECISION&
+& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
+#endif
+enddo
+#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) ... */
+#ifndef WITH_FIXED_REAL_KERNEL
+endif
+#endif /* not WITH_FIXED_REAL_KERNEL */
+#endif /* WITH_REAL_SVE512_BLOCK2_KERNEL */
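The 2hv/4hv/6hv kernel bodies referenced here live elsewhere in the tree; what distinguishes the sve512 family is that it would be written with ARM SVE C intrinsics from <arm_sve.h> (512-bit vectors on, e.g., the A64FX) instead of the x86 <x86intrin.h> intrinsics used by the AVX-512 kernels. As a rough flavor only, here is a toy single-Householder-style update in that idiom (not ELPA's kernel code), built with something like -march=armv8-a+sve:

    #include <arm_sve.h>

    /* Toy update y += tau * x on nq doubles; svcntd() is the hardware
     * vector length in doubles (8 on a 512-bit SVE machine). */
    static void axpy_sve(double *y, const double *x, double tau, int nq) {
      svfloat64_t vtau = svdup_f64(tau);            /* broadcast coefficient */
      for (int i = 0; i < nq; i += (int)svcntd()) {
        svbool_t pg = svwhilelt_b64(i, nq);         /* predicate masks the tail */
        svfloat64_t vx = svld1_f64(pg, &x[i]);
        svfloat64_t vy = svld1_f64(pg, &y[i]);
        vy = svmla_f64_x(pg, vy, vx, vtau);         /* vy += vx * vtau */
        svst1_f64(pg, &y[i], vy);
      }
    }

Predicated loads and stores make a separate scalar tail loop unnecessary, which is one reason SVE kernels can follow the AVX-512 structure so closely.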
#endif /* REALCASE */
#if COMPLEXCASE == 1
@@ -1341,6 +1411,50 @@ kernel)
endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2))
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX512_BLOCK2_KERNEL */
+! implementation of sve512 block 2 complex case
+#if defined(WITH_COMPLEX_SVE512_BLOCK2_KERNEL)
+#ifndef WITH_FIXED_COMPLEX_KERNEL
+if ((kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2)) then
+#endif /* not WITH_FIXED_COMPLEX_KERNEL */
+ttt = mpi_wtime()
+do j = ncols, 2, -2
+w(:,1) = bcast_buffer(1:nbw,j+off)
+w(:,2) = bcast_buffer(1:nbw,j+off-1)
+#ifdef WITH_OPENMP_TRADITIONAL
+call double_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_2hv_&
+&PRECISION&
+& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
+#else
+call double_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_2hv_&
+&PRECISION&
+& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
+#endif
+enddo
+#ifdef WITH_OPENMP_TRADITIONAL
+if (j==1) call single_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_1hv_&
+&PRECISION&
+& (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
+#else
+if (j==1) call single_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_1hv_&
+&PRECISION&
+& (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
+#endif
+#ifndef WITH_FIXED_COMPLEX_KERNEL
+endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2))
+#endif /* not WITH_FIXED_COMPLEX_KERNEL */
+#endif /* WITH_COMPLEX_SVE512_BLOCK2_KERNEL */
#endif /* COMPLEXCASE */
@@ -2191,10 +2305,80 @@ kernel)
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_AVX512_BLOCK4_KERNEL */
+! sve512 block4 real kernel
+#if defined(WITH_REAL_SVE512_BLOCK4_KERNEL)
+#ifndef WITH_FIXED_REAL_KERNEL
+if (kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4) then
+#endif /* not WITH_FIXED_REAL_KERNEL */
+#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SVE512_BLOCK6_KERNEL))
+! ARM SVE INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
+do j = ncols, 4, -4
+w(:,1) = bcast_buffer(1:nbw,j+off)
+w(:,2) = bcast_buffer(1:nbw,j+off-1)
+w(:,3) = bcast_buffer(1:nbw,j+off-2)
+w(:,4) = bcast_buffer(1:nbw,j+off-3)
+#ifdef WITH_OPENMP_TRADITIONAL
+call quad_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_4hv_&
+&PRECISION&
+& (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
+#else
+call quad_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_4hv_&
+&PRECISION&
+& (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
+#endif
+enddo
+do jj = j, 2, -2
+w(:,1) = bcast_buffer(1:nbw,jj+off)
+w(:,2) = bcast_buffer(1:nbw,jj+off-1)
+#ifdef WITH_OPENMP_TRADITIONAL
+call double_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_2hv_&
+&PRECISION&
+& (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
+#else
+call double_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_2hv_&
+&PRECISION&
+& (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
+#endif
+enddo
+#ifdef WITH_OPENMP_TRADITIONAL
+if (jj==1) call single_hh_trafo_&
+&MATH_DATATYPE&
+&_cpu_openmp_&
+&PRECISION&
+& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
+bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
+#else
+if (jj==1) call single_hh_trafo_&
+&MATH_DATATYPE&
+&_cpu_&
+&PRECISION&
+& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
+bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
+#endif
+#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SVE512_BLOCK6_KERNEL)) */
+#ifndef WITH_FIXED_REAL_KERNEL
+endif
+#endif /* not WITH_FIXED_REAL_KERNEL */
+#endif /* WITH_REAL_SVE512_BLOCK4_KERNEL */
#endif /* REALCASE */
#if COMPLEXCASE == 1
!no avx512 block4 complex kernel
+!no sve512 block4 complex kernel
#endif /* COMPLEXCASE */
@@ -2812,10 +2996,95 @@ kernel)
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_AVX512_BLOCK6_KERNEL */
+! sve512 block6 real kernel
+#if defined(WITH_REAL_SVE512_BLOCK6_KERNEL)
+#ifndef WITH_FIXED_REAL_KERNEL
+if ((kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6)) then
+#endif /* not WITH_FIXED_REAL_KERNEL */
+! ARM SVE INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
+do j = ncols, 6, -6
+w(:,1) = bcast_buffer(1:nbw,j+off)
+w(:,2) = bcast_buffer(1:nbw,j+off-1)
+w(:,3) = bcast_buffer(1:nbw,j+off-2)
+w(:,4) = bcast_buffer(1:nbw,j+off-3)
+w(:,5) = bcast_buffer(1:nbw,j+off-4)
+w(:,6) = bcast_buffer(1:nbw,j+off-5)
+#ifdef WITH_OPENMP_TRADITIONAL
+call hexa_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_6hv_&
+&PRECISION&
+& (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
+#else
+call hexa_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_6hv_&
+&PRECISION&
+& (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
+#endif
+enddo
+do jj = j, 4, -4
+w(:,1) = bcast_buffer(1:nbw,jj+off)
+w(:,2) = bcast_buffer(1:nbw,jj+off-1)
+w(:,3) = bcast_buffer(1:nbw,jj+off-2)
+w(:,4) = bcast_buffer(1:nbw,jj+off-3)
+#ifdef WITH_OPENMP_TRADITIONAL
+call quad_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_4hv_&
+&PRECISION&
+& (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
+#else
+call quad_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_4hv_&
+&PRECISION&
+& (c_loc(a(1,jj+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
+#endif
+enddo
+do jjj = jj, 2, -2
+w(:,1) = bcast_buffer(1:nbw,jjj+off)
+w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
+#ifdef WITH_OPENMP_TRADITIONAL
+call double_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_2hv_&
+&PRECISION&
+& (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
+#else
+call double_hh_trafo_&
+&MATH_DATATYPE&
+&_sve512_2hv_&
+&PRECISION&
+& (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
+#endif
+enddo
+#ifdef WITH_OPENMP_TRADITIONAL
+if (jjj==1) call single_hh_trafo_&
+&MATH_DATATYPE&
+&_cpu_openmp_&
+&PRECISION&
+& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
+bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
+#else
+if (jjj==1) call single_hh_trafo_&
+&MATH_DATATYPE&
+&_cpu_&
+&PRECISION&
+& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
+#endif
+#ifndef WITH_FIXED_REAL_KERNEL
+endif
+#endif /* not WITH_FIXED_REAL_KERNEL */
+#endif /* WITH_REAL_SVE512_BLOCK6_KERNEL */
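The block6 path shows the remainder cascade used by all blocked kernels: apply 6 Householder vectors at a time, then mop up the leftovers with the 4hv and 2hv kernels, and finally fall back to the scalar 1hv CPU routine. A small C sketch of just that control flow, with hypothetical apply6/apply4/apply2/apply1 stand-ins for the hexa/quad/double/single _hh_trafo_ kernels (the real calls also carry the matrix stripe and its geometry):

    /* Remainder cascade over ncols Householder vectors, mirroring the
     * Fortran loop nest above. After a Fortran counted DO loop exits,
     * the index holds the first value that failed the bound, which is
     * exactly what these fall-through C loops reproduce. */
    static void cascade(int ncols,
                        void (*apply6)(int), void (*apply4)(int),
                        void (*apply2)(int), void (*apply1)(int)) {
      int j = ncols;
      for (; j >= 6; j -= 6) apply6(j);  /* do j = ncols, 6, -6 */
      for (; j >= 4; j -= 4) apply4(j);  /* do jj = j, 4, -4    */
      for (; j >= 2; j -= 2) apply2(j);  /* do jjj = jj, 2, -2  */
      if (j == 1) apply1(j);             /* scalar CPU fallback */
    }

For ncols = 9, for instance, this runs one 6hv step, no 4hv step, one 2hv step, and one final 1hv step, consuming all nine vectors.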
#endif /* REALCASE */
#if COMPLEXCASE == 1
!no avx512 block6 complex kernel
+!no sve512 block6 complex kernel
#endif /* COMPLEXCASE */
if (wantDebug) then
...
@@ -319,7 +319,11 @@ subroutine trans_ev_tridi_to_band_&
#ifdef DOUBLE_PRECISION_REAL
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
-kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
+kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
+kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
+kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
+kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
+) then
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
! (8 * sizeof(double) == 64)
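The same rounding idiom appears in every branch of this routine: ((stripe_width+7)/8)*8 is integer round-up to the next multiple of 8, so that a stripe of 8 doubles (64 bytes) matches the 512-bit vector width shared by AVX-512 and 512-bit SVE, which is why the SVE512 kernels can simply join these existing conditions. A quick C illustration of the arithmetic (the helper name is mine):

    #include <assert.h>

    /* Round n up to the next multiple of m using the ((n+m-1)/m)*m idiom
     * from the Fortran above; integer division truncates. */
    static int round_up(int n, int m) { return ((n + m - 1) / m) * m; }

    int main(void) {
      assert(round_up(13, 8)  == 16);  /* 13 doubles -> 16 doubles = 128 bytes */
      assert(round_up(16, 8)  == 16);  /* already aligned: unchanged */
      assert(round_up(13, 16) == 16);  /* single precision: multiples of 16 */
      return 0;
    }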
@@ -331,7 +335,11 @@ subroutine trans_ev_tridi_to_band_&
#else
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
-kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
+kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
+kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
+kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
+kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
+) then
stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
@@ -347,7 +355,10 @@ subroutine trans_ev_tridi_to_band_&
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
-kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
+kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
+kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
+kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2 &
+) then
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
! (4 * sizeof(double complex) == 64)
@@ -360,7 +371,10 @@ subroutine trans_ev_tridi_to_band_&
#else
if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
-kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
+kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
+kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
+kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2 &
+) then
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
! (8 * sizeof(float complex) == 64)
@@ -424,7 +438,11 @@ subroutine trans_ev_tridi_to_band_&
#ifdef DOUBLE_PRECISION_REAL
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
-kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
+kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
+kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
+kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
+kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
+) then
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
! (8 * sizeof(double) == 64)
@@ -436,7 +454,11 @@ subroutine trans_ev_tridi_to_band_&
#else
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
-kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
+kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
+kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
+kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
+kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
+) then