Commit 9f99060c authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'arm_sve' into 'master_pre_stage'

Arm sve

See merge request !49
parents da8452de dd6397c9
......@@ -247,6 +247,27 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if WITH_REAL_SVE128_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_2hv_single_precision.c
endif
endif
if WITH_REAL_SVE256_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_2hv_single_precision.c
endif
endif
if WITH_REAL_SVE512_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_2hv_single_precision.c
endif
endif
if WITH_REAL_SPARC64_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
......@@ -296,6 +317,27 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if WITH_REAL_SVE128_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_4hv_single_precision.c
endif
endif
if WITH_REAL_SVE256_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_4hv_single_precision.c
endif
endif
if WITH_REAL_SVE512_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_4hv_single_precision.c
endif
endif
if WITH_REAL_SPARC64_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
......@@ -345,6 +387,27 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if WITH_REAL_SVE128_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_6hv_single_precision.c
endif
endif
if WITH_REAL_SVE256_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_6hv_single_precision.c
endif
endif
if WITH_REAL_SVE512_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_6hv_single_precision.c
endif
endif
#if WITH_COMPLEX_SPARC64_BLOCK1_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
......@@ -380,7 +443,6 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
if WITH_COMPLEX_AVX512_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx512_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
......@@ -388,6 +450,27 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
if WITH_COMPLEX_SVE128_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve128_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve128_1hv_single_precision.c
endif
endif
if WITH_COMPLEX_SVE256_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve256_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve256_1hv_single_precision.c
endif
endif
if WITH_COMPLEX_SVE512_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve512_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve512_1hv_single_precision.c
endif
endif
#if WITH_COMPLEX_SPARC64_BLOCK2_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
......@@ -430,6 +513,27 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
if WITH_COMPLEX_SVE128_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve128_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve128_2hv_single_precision.c
endif
endif
if WITH_COMPLEX_SVE256_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve256_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve256_2hv_single_precision.c
endif
endif
if WITH_COMPLEX_SVE512_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve512_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve512_2hv_single_precision.c
endif
endif
if STORE_BUILD_CONFIG
libelpa@SUFFIX@_private_la_SOURCES += src/helpers/print_build_config.c
......
......@@ -801,6 +801,14 @@ m4_define(elpa_m4_vsx_kernels, [
real_vsx_block6
])
m4_define(elpa_m4_sve128_kernels, [
real_sve128_block2
real_sve128_block4
real_sve128_block6
complex_sve128_block1
complex_sve128_block2
])
m4_define(elpa_m4_avx_kernels, [
real_avx_block2
real_avx_block4
......@@ -817,6 +825,14 @@ m4_define(elpa_m4_avx2_kernels, [
complex_avx2_block2
])
m4_define(elpa_m4_sve256_kernels, [
real_sve256_block2
real_sve256_block4
real_sve256_block6
complex_sve256_block1
complex_sve256_block2
])
m4_define(elpa_m4_avx512_kernels, [
real_avx512_block2
real_avx512_block4
......@@ -825,6 +841,14 @@ m4_define(elpa_m4_avx512_kernels, [
complex_avx512_block2
])
m4_define(elpa_m4_sve512_kernels, [
real_sve512_block2
real_sve512_block4
real_sve512_block6
complex_sve512_block1
complex_sve512_block2
])
m4_define(elpa_m4_bgp_kernels, [
real_bgp
complex_bgp
......@@ -840,7 +864,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
......@@ -881,6 +905,9 @@ ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable])
ELPA_SELECT_KERNELS([avx2],[enable])
ELPA_SELECT_KERNELS([avx512],[enable])
ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
......@@ -890,7 +917,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels, [
if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi
......@@ -950,7 +977,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
])
fi
])
m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512],[
m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512 sve128 sve256 sve512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
......@@ -1008,7 +1035,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
......@@ -1233,6 +1260,69 @@ if test x"${need_avx2}" = x"yes"; then
AC_DEFINE([HAVE_AVX2],[1],[AVX2 is supported on this CPU])
fi
if test x"${need_sve128}" = x"yes"; then
AC_MSG_CHECKING([whether we can compile SVE128 gcc intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <arm_sve.h>
int main(int argc, char **argv){
double *q;
svfloat64_t q1 = svld1_f64(svptrue_b64(), q);
svfloat64_t y1 = svmad_f64_z(svptrue_b64(), q1, q1, q1);
return 0;
}
])],
[can_compile_sve128=yes],
[can_compile_sve128=no]
)
AC_MSG_RESULT([${can_compile_sve128}])
if test x"$can_compile_sve128" != x"yes"; then
AC_MSG_ERROR([Could not compile a test program with SVE128, adjust the C compiler or CFLAGS. Possibly (some of) the flags " $SIMD_FLAGS " solve this issue])
fi
AC_DEFINE([HAVE_SVE128],[1],[SVE128 is supported on this CPU])
fi
if test x"${need_sve256}" = x"yes"; then
AC_MSG_CHECKING([whether we can compile SVE256 gcc intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <arm_sve.h>
int main(int argc, char **argv){
double *q;
svfloat64_t q1 = svld1_f64(svptrue_b64(), q);
svfloat64_t y1 = svmad_f64_z(svptrue_b64(), q1, q1, q1);
return 0;
}
])],
[can_compile_sve256=yes],
[can_compile_sve256=no]
)
AC_MSG_RESULT([${can_compile_sve256}])
if test x"$can_compile_sve256" != x"yes"; then
AC_MSG_ERROR([Could not compile a test program with SVE256, adjust the C compiler or CFLAGS. Possibly (some of) the flags " $SIMD_FLAGS " solve this issue])
fi
AC_DEFINE([HAVE_SVE256],[1],[SVE256 is supported on this CPU])
fi
if test x"${need_sve512}" = x"yes"; then
AC_MSG_CHECKING([whether we can compile SVE512 gcc intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <arm_sve.h>
int main(int argc, char **argv){
double *q;
svfloat64_t q1 = svld1_f64(svptrue_b64(), q);
svfloat64_t y1 = svmad_f64_z(svptrue_b64(), q1, q1, q1);
return 0;
}
])],
[can_compile_sve512=yes],
[can_compile_sve512=no]
)
AC_MSG_RESULT([${can_compile_sve512}])
if test x"$can_compile_sve512" != x"yes"; then
AC_MSG_ERROR([Could not compile a test program with SVE512, adjust the C compiler or CFLAGS. Possibly (some of) the flags " $SIMD_FLAGS " solve this issue])
fi
AC_DEFINE([HAVE_SVE512],[1],[SVE512 is supported on this CPU])
fi
if test x"${need_avx512}" = x"yes"; then
AC_MSG_CHECKING([whether we can compile AVX512 gcc intrinsics in C])
......
......@@ -60,8 +60,17 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 28, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 29, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_SVE128_BLOCK2, 28, @ELPA_2STAGE_REAL_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK4, 29, @ELPA_2STAGE_REAL_SVE128_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK6, 30, @ELPA_2STAGE_REAL_SVE128_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK2, 31, @ELPA_2STAGE_REAL_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK4, 32, @ELPA_2STAGE_REAL_SVE256_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK6, 33, @ELPA_2STAGE_REAL_SVE256_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 34, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 35, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 36, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 37, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 38, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......@@ -87,7 +96,13 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_SVE128_BLOCK1, 14, @ELPA_2STAGE_COMPLEX_SVE128_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE128_BLOCK2, 15, @ELPA_2STAGE_COMPLEX_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE256_BLOCK1, 16, @ELPA_2STAGE_COMPLEX_SVE256_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE256_BLOCK2, 17, @ELPA_2STAGE_COMPLEX_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK1, 18, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 19, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 20, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
......@@ -9,5 +9,6 @@
#define VSX_INSTR 9
#define ARCH64_INSTR 10
#define SPARC_INSTR 11
#define SVE512_INSTR 12
#define NUMBER_OF_INSTR 12
#define NUMBER_OF_INSTR 13
This diff is collapsed.
......@@ -319,7 +319,11 @@ subroutine trans_ev_tridi_to_band_&
#ifdef DOUBLE_PRECISION_REAL
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
) then
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
! (8 * sizeof(double) == 64)
......@@ -331,7 +335,11 @@ subroutine trans_ev_tridi_to_band_&
#else
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
) then
stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
......@@ -347,7 +355,10 @@ subroutine trans_ev_tridi_to_band_&
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2 &
) then
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
! (4 * sizeof(double complex) == 64)
......@@ -360,7 +371,10 @@ subroutine trans_ev_tridi_to_band_&
#else
if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2 &
) then
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
! (8 * sizeof(float complex) == 64)
......@@ -424,7 +438,11 @@ subroutine trans_ev_tridi_to_band_&
#ifdef DOUBLE_PRECISION_REAL
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
) then
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
! (8 * sizeof(double) == 64)
......@@ -436,7 +454,11 @@ subroutine trans_ev_tridi_to_band_&
#else
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
) then
stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
......@@ -453,7 +475,10 @@ subroutine trans_ev_tridi_to_band_&
#ifdef DOUBLE_PRECISION_COMPLEX
if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2 &
) then
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
! (4 * sizeof(double complex) == 64)
......@@ -466,7 +491,10 @@ subroutine trans_ev_tridi_to_band_&
#else
if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2 &
) then
stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
! (8 * sizeof(float complex) == 64)
......
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF
#include "config-f90.h"
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET SVE_128
#include "../../general/precision_macros.h"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK1
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF
#include "config-f90.h"
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET SVE_128
#include "../../general/precision_macros.h"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK1
#undef SINGLE_PRECISION
#undef COMPLEXCASE
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF
#include "config-f90.h"
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET SVE_128
#include "../../general/precision_macros.h"
#include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef DOUBLE_PRECISION
#undef COMPLEXCASE