Commit 4d44a9c5 authored by Andreas Marek's avatar Andreas Marek
Browse files

Kernels for SVE 128, 256 and 512 bit kernels

- at the moment no runtime detection whether processor
  SVE vector length is 128, 256 or 512 bit
parent 6bbe9235
......@@ -247,6 +247,20 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if WITH_REAL_SVE128_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_2hv_single_precision.c
endif
endif
if WITH_REAL_SVE256_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_2hv_single_precision.c
endif
endif
if WITH_REAL_SVE512_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
......@@ -303,6 +317,20 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if WITH_REAL_SVE128_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_4hv_single_precision.c
endif
endif
if WITH_REAL_SVE256_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_4hv_single_precision.c
endif
endif
if WITH_REAL_SVE512_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
......@@ -359,6 +387,20 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if WITH_REAL_SVE128_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_6hv_single_precision.c
endif
endif
if WITH_REAL_SVE256_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_6hv_single_precision.c
endif
endif
if WITH_REAL_SVE512_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
......@@ -408,6 +450,20 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
if WITH_COMPLEX_SVE128_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve128_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve128_1hv_single_precision.c
endif
endif
if WITH_COMPLEX_SVE256_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve256_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve256_1hv_single_precision.c
endif
endif
if WITH_COMPLEX_SVE512_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve512_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
......@@ -457,6 +513,20 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
if WITH_COMPLEX_SVE128_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve128_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve128_2hv_single_precision.c
endif
endif
if WITH_COMPLEX_SVE256_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve256_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve256_2hv_single_precision.c
endif
endif
if WITH_COMPLEX_SVE512_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve512_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
......
......@@ -801,6 +801,14 @@ m4_define(elpa_m4_vsx_kernels, [
real_vsx_block6
])
m4_define(elpa_m4_sve128_kernels, [
real_sve128_block2
real_sve128_block4
real_sve128_block6
complex_sve128_block1
complex_sve128_block2
])
m4_define(elpa_m4_avx_kernels, [
real_avx_block2
real_avx_block4
......@@ -817,6 +825,14 @@ m4_define(elpa_m4_avx2_kernels, [
complex_avx2_block2
])
m4_define(elpa_m4_sve256_kernels, [
real_sve256_block2
real_sve256_block4
real_sve256_block6
complex_sve256_block1
complex_sve256_block2
])
m4_define(elpa_m4_avx512_kernels, [
real_avx512_block2
real_avx512_block4
......@@ -848,7 +864,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 sve512 bgp bgq gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
......@@ -889,6 +905,8 @@ ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable])
ELPA_SELECT_KERNELS([avx2],[enable])
ELPA_SELECT_KERNELS([avx512],[enable])
ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
......@@ -899,7 +917,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels elpa_m4_sve512_kernels, [
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels, [
if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi
......@@ -959,7 +977,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
])
fi
])
m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512 sve512],[
m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512 sve128 sve256 sve512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
......@@ -1017,7 +1035,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
......@@ -1242,6 +1260,48 @@ if test x"${need_avx2}" = x"yes"; then
AC_DEFINE([HAVE_AVX2],[1],[AVX2 is supported on this CPU])
fi
if test x"${need_sve128}" = x"yes"; then
AC_MSG_CHECKING([whether we can compile SVE128 gcc intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <arm_sve.h>
int main(int argc, char **argv){
double *q;
svfloat64_t q1 = svld1_f64(svptrue_b64(), q);
svfloat64_t y1 = svmad_f64_z(svptrue_b64(), q1, q1, q1);
return 0;
}
])],
[can_compile_sve128=yes],
[can_compile_sve128=no]
)
AC_MSG_RESULT([${can_compile_sve128}])
if test x"$can_compile_sve128" != x"yes"; then
AC_MSG_ERROR([Could not compile a test program with SVE128, adjust the C compiler or CFLAGS. Possibly (some of) the flags " $SIMD_FLAGS " solve this issue])
fi
AC_DEFINE([HAVE_SVE128],[1],[SVE128 is supported on this CPU])
fi
if test x"${need_sve256}" = x"yes"; then
AC_MSG_CHECKING([whether we can compile SVE256 gcc intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <arm_sve.h>
int main(int argc, char **argv){
double *q;
svfloat64_t q1 = svld1_f64(svptrue_b64(), q);
svfloat64_t y1 = svmad_f64_z(svptrue_b64(), q1, q1, q1);
return 0;
}
])],
[can_compile_sve256=yes],
[can_compile_sve256=no]
)
AC_MSG_RESULT([${can_compile_sve256}])
if test x"$can_compile_sve256" != x"yes"; then
AC_MSG_ERROR([Could not compile a test program with SVE256, adjust the C compiler or CFLAGS. Possibly (some of) the flags " $SIMD_FLAGS " solve this issue])
fi
AC_DEFINE([HAVE_SVE256],[1],[SVE256 is supported on this CPU])
fi
if test x"${need_sve512}" = x"yes"; then
AC_MSG_CHECKING([whether we can compile SVE512 gcc intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
......
......@@ -60,11 +60,17 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 28, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 29, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 30, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 31, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 32, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_SVE128_BLOCK2, 28, @ELPA_2STAGE_REAL_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK4, 29, @ELPA_2STAGE_REAL_SVE128_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK6, 30, @ELPA_2STAGE_REAL_SVE128_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK2, 31, @ELPA_2STAGE_REAL_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK4, 32, @ELPA_2STAGE_REAL_SVE256_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK6, 33, @ELPA_2STAGE_REAL_SVE256_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 34, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 35, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 36, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 37, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 38, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......@@ -90,9 +96,13 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK1, 14, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 15, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 16, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_SVE128_BLOCK1, 14, @ELPA_2STAGE_COMPLEX_SVE128_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE128_BLOCK2, 15, @ELPA_2STAGE_COMPLEX_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE256_BLOCK1, 16, @ELPA_2STAGE_COMPLEX_SVE256_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE256_BLOCK2, 17, @ELPA_2STAGE_COMPLEX_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK1, 18, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 19, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 20, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
......@@ -316,6 +316,8 @@ kernel)
kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE128_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE256_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
......@@ -602,7 +604,7 @@ kernel)
#endif /* COMPLEXCASE */
#if REALCASE == 1
! no sse, vsx, sparc64 block1 real kernel
! no sse, vsx, sparc64 sve block1 real kernel
#endif
#if COMPLEXCASE == 1
......@@ -702,6 +704,36 @@ kernel)
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */
! sve128 block1 complex kernel
#if defined(WITH_COMPLEX_SVE128_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_SVE128_BLOCK1) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE128_BLOCK2_KERNEL))
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP_TRADITIONAL
call single_hh_trafo_&
&MATH_DATATYPE&
&_sve128_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_sve128_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
......@@ -774,7 +806,34 @@ kernel)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX2_BLOCK1_KERNEL */
#if defined(WITH_COMPLEX_SVE256_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if ((kernel .eq. ELPA_2STAGE_COMPLEX_SVE256_BLOCK1)) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE256_BLOCK2_KERNEL))
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP_TRADITIONAL
call single_hh_trafo_&
&MATH_DATATYPE&
&_sve256_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_sve256_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE256_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_SVE256_BLOCK1))
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SVE256_BLOCK1_KERNEL */
#endif /* COMPLEXCASE */
......@@ -917,8 +976,40 @@ kernel)
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL */
#endif /* REALCASE == 1 */
! implementation of neon_arch64 block 2 real case
#if defined(WITH_REAL_SVE128_BLOCK2_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_SVE128_BLOCK2) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SVE128_BLOCK6_KERNEL) && !defined(WITH_REAL_SVE128_BLOCK4_KERNEL))
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP_TRADITIONAL
call double_hh_trafo_&
&MATH_DATATYPE&
&_sve128_2hv_&
&PRECISION &
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_sve128_2hv_&
&PRECISION &
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SVE128_BLOCK6_KERNEL) && !defined(WITH_REAL_SVE128_BLOCK4_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SVE128_BLOCK2_KERNEL */
#endif /* REALCASE == 1 */
#if REALCASE == 1
......@@ -1131,6 +1222,51 @@ kernel)
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
! implementation of sve128 block 2 complex case
#if defined(WITH_COMPLEX_SVE128_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_SVE128_BLOCK2) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt = mpi_wtime()
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP_TRADITIONAL
call double_hh_trafo_&
&MATH_DATATYPE&
&_sve128_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_sve128_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifdef WITH_OPENMP_TRADITIONAL
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_sve128_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_sve128_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SVE128_BLOCK2)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SVE128_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */
#if REALCASE == 1
......@@ -1201,6 +1337,40 @@ kernel)
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_AVX2_BLOCK2_KERNEL */
#if defined(WITH_REAL_SVE256_BLOCK2_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if ((kernel .eq. ELPA_2STAGE_REAL_SVE256_BLOCK2)) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SVE256_BLOCK6_KERNEL) && !defined(WITH_REAL_SVE256_BLOCK4_KERNEL))
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP_TRADITIONAL
call double_hh_trafo_&
&MATH_DATATYPE&
&_sve256_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_sve256_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) ... */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SVE256_BLOCK2_KERNEL */
#endif /* REALCASE */
#if COMPLEXCASE == 1
......@@ -1290,6 +1460,50 @@ kernel)
endif ! ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) )
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX2_BLOCK2_KERNEL */
! implementation of sve256 block 2 complex case
#if defined(WITH_COMPLEX_SVE256_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if ( (kernel .eq. ELPA_2STAGE_COMPLEX_SVE256_BLOCK2) ) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt = mpi_wtime()
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP_TRADITIONAL
call double_hh_trafo_&
&MATH_DATATYPE&
&_sve256_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_sve256_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifdef WITH_OPENMP_TRADITIONAL
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_sve256_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_sve256_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! ( (kernel .eq. ELPA_2STAGE_COMPLEX_SVE256_BLOCK2) )
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SVE256_BLOCK2_KERNEL */
#endif /* COMPLEXCASE */
#if REALCASE == 1
......@@ -1949,6 +2163,75 @@ kernel)
#endif /* REALCASE */
#if REALCASE == 1
! sve128 block 4 real kernel
#if defined(WITH_REAL_SVE128_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_SVE128_BLOCK4) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SVE128_BLOCK6_KERNEL))
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
#ifdef WITH_OPENMP_TRADITIONAL
call quad_hh_trafo_&
&MATH_DATATYPE&
&_sve128_4hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&_sve128_4hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
do jj = j, 2, -2
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
#ifdef WITH_OPENMP_TRADITIONAL
call double_hh_trafo_&
&MATH_DATATYPE&
&_sve128_2hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_sve128_2hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifdef WITH_OPENMP_TRADITIONAL
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SVE128_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SVE128_BLOCK4_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1
! vsx block4 real kernel
......@@ -2230,6 +2513,74 @@ kernel)
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_AVX2_BLOCK4_KERNEL */
! sve256 block4 real kernel
#if defined(WITH_REAL_SVE256_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL