
Commit 4d0b0ab1 authored by Andreas Marek

Start to implement real NEON ARCH64 kernels
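The new kernels are AArch64 NEON ports of the existing 2hv/4hv/6hv Householder-transformation kernels. Their core building block is the fused multiply-add intrinsic probed for in configure below; a minimal illustrative sketch (not the committed kernel code; the function and array names are invented here):

#include <arm_neon.h>

/* Illustrative sketch only: y += h * x, two doubles per iteration -- the
 * update pattern a 2hv/4hv/6hv Householder kernel repeats across the
 * stripe width. Assumes n is even, for brevity. */
static void axpy_neon(double *y, const double *x, double h, int n) {
  float64x2_t vh = vdupq_n_f64(h);       /* broadcast the scalar h   */
  for (int i = 0; i + 2 <= n; i += 2) {
    float64x2_t vx = vld1q_f64(&x[i]);   /* load two doubles         */
    float64x2_t vy = vld1q_f64(&y[i]);
    vy = vfmaq_f64(vy, vx, vh);          /* vy = vy + vx * vh (FMA)  */
    vst1q_f64(&y[i], vy);                /* store the updated pair   */
  }
}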

parent b844cf4d
@@ -908,10 +908,14 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/asm_x86_64_double_precision.s \
@top_srcdir@/src/elpa2/kernels/real_avx512_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/asm_x86_64_single_precision.s \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c \
@@ -929,19 +933,27 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_bgq.f90 \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_simple.F90 \
@top_srcdir@/src/elpa2/kernels/complex.F90 \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_bgp.f90 \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c \
@top_srcdir@/src/elpa2/elpa2_compute_complex_template.F90 \
@top_srcdir@/src/elpa2/elpa2_bandred_template.F90 \
@top_srcdir@/src/elpa2/pack_unpack_gpu.F90 \
...
@@ -227,6 +227,13 @@ if WITH_REAL_SPARC64_BLOCK2_KERNEL
#endif
endif
if WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c
endif
endif
if WITH_REAL_VSX_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
@@ -269,6 +276,13 @@ if WITH_REAL_SPARC64_BLOCK4_KERNEL
#endif
endif
if WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c
endif
endif
if WITH_REAL_VSX_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
@@ -311,6 +325,13 @@ if WITH_REAL_SPARC64_BLOCK6_KERNEL
#endif
endif
if WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c
endif
endif
if WITH_REAL_VSX_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
...
@@ -636,6 +636,12 @@ m4_define(elpa_m4_sparc64_kernels, [
real_sparc64_block6
])
m4_define(elpa_m4_neon_arch64_kernels, [
real_neon_arch64_block2
real_neon_arch64_block4
real_neon_arch64_block6
])
m4_define(elpa_m4_vsx_kernels, [
real_vsx_block2
real_vsx_block4
@@ -681,7 +687,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
@@ -715,6 +721,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [
dnl Modify list of kernels with configure arguments
ELPA_SELECT_KERNELS([generic],[enable])
ELPA_SELECT_KERNELS([sparc64],[disable])
ELPA_SELECT_KERNELS([neon_arch64],[disable])
ELPA_SELECT_KERNELS([vsx],[disable])
ELPA_SELECT_KERNELS([sse],[enable])
ELPA_SELECT_KERNELS([sse_assembly],[enable])
@@ -730,7 +737,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi
@@ -790,7 +797,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
])
fi
])
m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
@@ -848,7 +855,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
@@ -895,7 +902,6 @@ int main(int argc, char **argv) {
AC_DEFINE([HAVE_VSX_SSE],[1],[Altivec VSX intrinsics are supported on this CPU])
fi
if test x"${need_sparc64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
@@ -917,6 +923,27 @@ int main(int argc, char **argv) {
AC_DEFINE([HAVE_SPARC64_SSE],[1],[SPARC64 intrinsics are supported on this CPU])
fi
if test x"${need_neon_arch64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile NEON ARCH64 with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <arm_neon.h>
int main(int argc, char **argv) {
__Float64x2_t x1, x2, x3, x4;
x4 = vfmaq_f64(x1, x2, x3);
return 0;
}
])],
[can_compile_neon_arch64=yes],
[can_compile_neon_arch64=no]
)
AC_MSG_RESULT([${can_compile_neon_arch64}])
if test x"$can_compile_neon_arch64" != x"yes"; then
AC_MSG_ERROR([Could not compile test program, try with --disable-neon_arch64, or adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_NEON_ARCH64_SSE],[1],[NEON_ARCH64 intrinsics are supported on this CPU])
fi
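The probe above can also be reproduced outside of configure. A standalone restatement of the same test (using the public float64x2_t spelling of the vector type, with initialization added so the program runs rather than merely compiles) is:

/* Build on an AArch64 toolchain, e.g.: cc -o neon_probe neon_probe.c */
#include <arm_neon.h>

int main(void) {
  float64x2_t x1 = vdupq_n_f64(1.0);
  float64x2_t x2 = vdupq_n_f64(2.0);
  float64x2_t x3 = vdupq_n_f64(3.0);
  float64x2_t x4 = vfmaq_f64(x1, x2, x3);   /* x4 = x1 + x2 * x3 = 7.0 */
  return vgetq_lane_f64(x4, 0) == 7.0 ? 0 : 1;
}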
if test x"${need_sse}" = x"yes"; then if test x"${need_sse}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C) AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
......
@@ -44,10 +44,13 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 22, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 28, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
...
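Note that inserting the three NEON entries at IDs 22-24 renumbers the VSX and GENERIC_SIMPLE_BLOCK4 kernels that follow (formerly 22-25, now 25-28), so the numeric constant values change. The `X(...)` lines are the usual X-macro pattern: the kernel list is written once and expanded with different definitions of `X`. A simplified, self-contained illustration (hypothetical code with an abbreviated list, not ELPA's actual headers):

#include <stdio.h>

/* Simplified X-macro illustration: one kernel list, expanded twice
 * with different meanings of X. */
#define FOR_ALL_REAL_KERNELS(X) \
  X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 22) \
  X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 23) \
  X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 24)

#define AS_ENUM(name, value) name = value,
enum { FOR_ALL_REAL_KERNELS(AS_ENUM) };      /* expands to enum constants */

#define AS_NAME(name, value) case name: return #name;
static const char *kernel_name(int k) {      /* expands to a switch       */
  switch (k) { FOR_ALL_REAL_KERNELS(AS_NAME) default: return "unknown"; }
}

int main(void) {
  printf("%d -> %s\n", (int) ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4,
         kernel_name(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4));
  return 0;
}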
@@ -339,6 +339,7 @@
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
@@ -850,6 +851,43 @@
#endif /* REALCASE == 1 */
#if REALCASE == 1
! implementation of neon_arch64 block 2 real case
#if defined(WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL))
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_2hv_&
&PRECISION &
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_2hv_&
&PRECISION &
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL */
#endif /* REALCASE == 1 */
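The `double_hh_trafo_*_neon_arch64_2hv_*` routines invoked above via `c_loc` are C kernels with Fortran-visible interfaces. By analogy with ELPA's existing SSE and SPARC64 2hv kernels, the double-precision entry point would be declared roughly as follows (a sketch only; the actual prototype lives in the new kernel sources, which are not part of this hunk):

/* Sketch by analogy with ELPA's other 2hv kernels: q is the matrix stripe,
 * hh holds the two Householder vectors; scalars arrive by reference from
 * Fortran (nb = Householder vector length, nq = columns to update,
 * ldq/ldh = leading dimensions). */
void double_hh_trafo_real_neon_arch64_2hv_double(double *q, double *hh,
                                                 int *pnb, int *pnq,
                                                 int *pldq, int *pldh);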
#if REALCASE == 1
! implementation of vsx block 2 real case
@@ -1656,6 +1694,77 @@
#endif /* REALCASE */
#if REALCASE == 1
! neon_arch64 block 4 real kernel
#if defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL))
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
#ifdef WITH_OPENMP
call quad_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_4hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_4hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
do jj = j, 2, -2
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_2hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_2hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifdef WITH_OPENMP
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1
! vsx block4 real kernel
@@ -2040,6 +2149,94 @@
#endif /* REALCASE */
#if REALCASE == 1
!neon_arch64 block6 real kernel
#if defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6) then
#endif /* not WITH_FIXED_REAL_KERNEL */
! NEON ARCH64 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
w(:,5) = bcast_buffer(1:nbw,j+off-4)
w(:,6) = bcast_buffer(1:nbw,j+off-5)
#ifdef WITH_OPENMP
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_6hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_6hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
do jj = j, 4, -4
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
w(:,3) = bcast_buffer(1:nbw,jj+off-2)
w(:,4) = bcast_buffer(1:nbw,jj+off-3)
#ifdef WITH_OPENMP
call quad_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_4hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&neon_arch64_4hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-3,istripe)), w, &
nbw, nl, stripe_width, nbw)
#endif
enddo
do jjj = jj, 2, -2
w(:,1) = bcast_buffer(1:nbw,jjj+off)
w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_2hv_&
&PRECISION&
& (c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_neon_arch64_2hv_&
&PRECISION&
& (c_loc(a(1,jjj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifdef WITH_OPENMP
if (jjj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jjj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1
!vsx block6 real kernel
#if defined(WITH_REAL_VSX_BLOCK6_KERNEL)
...
@@ -65,17 +65,28 @@
#define CONCAT_3ARGS(a, b, c) CONCAT2_3ARGS(a, b, c)
#define CONCAT2_3ARGS(a, b, c) a ## b ## c
//define instruction set numbers
#define NEON_ARCH64_128 1285
#if VEC_SET == 128 || VEC_SET == 256 || VEC_SET == 512
#include <x86intrin.h>
#endif
#if VEC_SET == 1281
#include <fjmfunc.h>
#include <emmintrin.h>
#endif
#if VEC_SET == 1282
#include <altivec.h>
#endif
#if VEC_SET == NEON_ARCH64_128
#include <arm_neon.h>
#endif
#include <stdio.h>
#include <stdlib.h>
@@ -106,6 +117,10 @@
#define SIMD_SET VSX
#endif
#if VEC_SET == NEON_ARCH64_128
#define SIMD_SET NEON_ARCH64
#endif
#if VEC_SET == 256
#define SIMD_SET AVX_AVX2
#endif
@@ -129,6 +144,10 @@
#if VEC_SET == 128
#define _SIMD_SET _mm_set_pd
#define _SIMD_SET1 _mm_set1_pd
#define _SIMD_NEG 1
#endif
#if VEC_SET == 1281
#define _SIMD_NEG _fjsp_neg_v2r8
#endif
#endif /* DOUBLE_PRECISION_REAL */
#ifdef SINGLE_PRECISION_REAL
@@ -143,7 +162,11 @@
#if VEC_SET == 128
#define _SIMD_SET _mm_set_ps
#define _SIMD_SET1 _mm_set1_ps
#define _SIMD_NEG 1
#endif
#if VEC_SET == 1281
#define _SIMD_NEG 1
#endif
#endif /* SINGLE_PRECISION_REAL */
#endif /* VEC_SET == 128 || VEC_SET == 1281 */
@@ -161,6 +184,7 @@
#define _SIMD_LOAD (__vector float) vec_ld
#endif
#define _SIMD_NEG 1
#define _SIMD_STORE vec_st
#define _SIMD_ADD vec_add
#define _SIMD_MUL vec_mul
@@ -168,6 +192,35 @@
#endif /* VEC_SET == 1281 */
#if VEC_SET == NEON_ARCH64_128
#ifdef DOUBLE_PRECISION_REAL
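The hunk is truncated here. Following the SSE (`VEC_SET == 128`) and `VEC_SET == 1281` blocks above, the NEON branch would presumably map the generic `_SIMD_*` macros onto `arm_neon.h` intrinsics. A hypothetical sketch of that double-precision mapping (not the committed definitions, which lie in the truncated remainder of this diff):

/* Hypothetical continuation, by analogy with the blocks above. */
#define __SIMD_DATATYPE float64x2_t
#define _SIMD_LOAD      vld1q_f64     /* load two doubles                */
#define _SIMD_STORE     vst1q_f64     /* store two doubles               */
#define _SIMD_ADD       vaddq_f64
#define _SIMD_MUL       vmulq_f64
#define _SIMD_FMA       vfmaq_f64     /* a + b*c, core of the hv updates */
#define _SIMD_NEG       vnegq_f64
#define _SIMD_SET1      vdupq_n_f64   /* broadcast a scalar              */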