Commit fa3e9892 authored by Andreas Marek

Real block2 double-precision kernel for Power8-Computer

parent ab06e091
...@@ -209,6 +209,13 @@ if WANT_SINGLE_PRECISION_REAL ...@@ -209,6 +209,13 @@ if WANT_SINGLE_PRECISION_REAL
endif endif
endif endif
# VSX real block2 (2hv) kernel: the double-precision source is always added when
# WITH_REAL_VSX_BLOCK2_KERNEL is set; the single-precision source is added only
# when WANT_SINGLE_PRECISION_REAL is set as well.
# NOTE(review): confirm that real_vsx_2hv_single_precision.c exists in the tree
# before combining the VSX kernel with single-precision builds.
if WITH_REAL_VSX_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_single_precision.c
endif
endif
if WITH_REAL_SSE_BLOCK2_KERNEL if WITH_REAL_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_2hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL if WANT_SINGLE_PRECISION_REAL
...@@ -237,13 +244,19 @@ if WANT_SINGLE_PRECISION_REAL ...@@ -237,13 +244,19 @@ if WANT_SINGLE_PRECISION_REAL
endif endif
endif endif
#if WITH_REAL_SPARC64_BLOCK4_KERNEL
if WITH_REAL_SPARC64_BLOCK4_KERNEL # libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c #if WANT_SINGLE_PRECISION_REAL
if WANT_SINGLE_PRECISION_REAL # libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c #endif
endif #endif
endif #
#if WITH_REAL_VSX_BLOCK4_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_single_precision.c
#endif
#endif
if WITH_REAL_SSE_BLOCK4_KERNEL if WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_4hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_4hv_double_precision.c
...@@ -273,13 +286,19 @@ if WANT_SINGLE_PRECISION_REAL ...@@ -273,13 +286,19 @@ if WANT_SINGLE_PRECISION_REAL
endif endif
endif endif
if WITH_REAL_SPARC64_BLOCK6_KERNEL #if WITH_REAL_SPARC64_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c # libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL #if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c # libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
endif #endif
endif #endif
#
#if WITH_REAL_VSX_BLOCK6_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_single_precision.c
#endif
#endif
if WITH_REAL_SSE_BLOCK6_KERNEL if WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_6hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_6hv_double_precision.c
...@@ -309,13 +328,19 @@ if WANT_SINGLE_PRECISION_REAL ...@@ -309,13 +328,19 @@ if WANT_SINGLE_PRECISION_REAL
endif endif
endif endif
#if WITH_COMPLEX_SPARC64_BLOCK1_KERNEL
if WITH_COMPLEX_SPARC64_BLOCK1_KERNEL # libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_double_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_double_precision.c #if WANT_SINGLE_PRECISION_COMPLEX
if WANT_SINGLE_PRECISION_COMPLEX # libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_single_precision.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_single_precision.c #endif
endif #endif
endif #
#if WITH_COMPLEX_VSX_BLOCK1_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_1hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_1hv_single_precision.c
#endif
#endif
if WITH_COMPLEX_SSE_BLOCK1_KERNEL if WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_1hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_1hv_double_precision.c
...@@ -346,12 +371,19 @@ if WANT_SINGLE_PRECISION_COMPLEX ...@@ -346,12 +371,19 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif endif
endif endif
if WITH_COMPLEX_SPARC64_BLOCK2_KERNEL #if WITH_COMPLEX_SPARC64_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_double_precision.c # libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX #if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_single_precision.c # libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_single_precision.c
endif #endif
endif #endif
#
#if WITH_COMPLEX_VSX_BLOCK2_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_2hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_2hv_single_precision.c
#endif
#endif
if WITH_COMPLEX_SSE_BLOCK2_KERNEL if WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_2hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_2hv_double_precision.c
......
...@@ -474,6 +474,14 @@ m4_define(elpa_m4_sparc64_kernels, [ ...@@ -474,6 +474,14 @@ m4_define(elpa_m4_sparc64_kernels, [
complex_sparc64_block2 complex_sparc64_block2
]) ])
dnl Whitespace-separated list of the kernel names that belong to the "vsx"
dnl kernel type (three real block sizes, two complex block sizes).
dnl NOTE(review): in the accompanying Makefile.am change only the real block2
dnl VSX sources are active; the block4/block6/complex VSX source lines are
dnl commented out there. Confirm these kernels cannot be selected before their
dnl sources are built.
m4_define(elpa_m4_vsx_kernels, [
real_vsx_block2
real_vsx_block4
real_vsx_block6
complex_vsx_block1
complex_vsx_block2
])
m4_define(elpa_m4_avx_kernels, [ m4_define(elpa_m4_avx_kernels, [
real_avx_block2 real_avx_block2
real_avx_block4 real_avx_block4
...@@ -513,7 +521,7 @@ m4_define(elpa_m4_gpu_kernels, [ ...@@ -513,7 +521,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu complex_gpu
]) ])
m4_define(elpa_m4_kernel_types, [generic sparc64 sse sse_assembly avx avx2 avx512 bgp bgq gpu]) m4_define(elpa_m4_kernel_types, [generic sparc64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels, m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type], m4_foreach_w([elpa_m4_type],
...@@ -547,6 +555,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [ ...@@ -547,6 +555,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [
dnl Modify list of kernels with configure arguments dnl Modify list of kernels with configure arguments
ELPA_SELECT_KERNELS([generic],[enable]) ELPA_SELECT_KERNELS([generic],[enable])
ELPA_SELECT_KERNELS([sparc64],[disable]) ELPA_SELECT_KERNELS([sparc64],[disable])
ELPA_SELECT_KERNELS([vsx],[disable])
ELPA_SELECT_KERNELS([sse],[enable]) ELPA_SELECT_KERNELS([sse],[enable])
ELPA_SELECT_KERNELS([sse_assembly],[enable]) ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable]) ELPA_SELECT_KERNELS([avx],[enable])
...@@ -561,7 +570,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[ ...@@ -561,7 +570,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
]) ])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [ m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option" echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi fi
...@@ -621,7 +630,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[ ...@@ -621,7 +630,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
]) ])
fi fi
]) ])
m4_foreach_w([elpa_m4_arch],[sparc64 sse avx avx2 avx512],[ m4_foreach_w([elpa_m4_arch],[sparc64 vsx sse avx avx2 avx512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2]) ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2]) ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1]) ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
...@@ -655,7 +664,7 @@ dnl choosing a default kernel ...@@ -655,7 +664,7 @@ dnl choosing a default kernel
m4_foreach_w([elpa_m4_kind],[real complex],[ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel], m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel], m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_generic_kernels, elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ), [m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[ [
if test -z "$default_[]elpa_m4_kind[]_kernel"; then if test -z "$default_[]elpa_m4_kind[]_kernel"; then
...@@ -681,6 +690,28 @@ dnl __m128d h1 = _fjsp_neg_v2r8(q); ...@@ -681,6 +690,28 @@ dnl __m128d h1 = _fjsp_neg_v2r8(q);
dnl return 0; dnl return 0;
dnl } dnl }
AC_LANG_PUSH([C]) AC_LANG_PUSH([C])
dnl VSX compile check: only performed when need_vsx is "yes".
dnl Compiles (does not link or run) a small C program that includes <altivec.h>
dnl and calls vec_add on __vector double operands. On failure configure aborts
dnl with a hint to use --disable-vsx or to adjust the C compiler / CFLAGS.
dnl On success HAVE_VSX_SSE is defined to 1.
dnl NOTE(review): the define is named HAVE_VSX_SSE although the check is for
dnl VSX; confirm this is the exact name the kernel sources test for.
if test x"${need_vsx}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile Altivec VSX with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <altivec.h>
int main(int argc, char **argv) {
__vector double a, b, c;
c = vec_add(a,b);
return 0;
}
])],
[can_compile_vsx=yes],
[can_compile_vsx=no]
)
AC_MSG_RESULT([${can_compile_vsx}])
if test x"$can_compile_vsx" != x"yes"; then
AC_MSG_ERROR([Could not compile test program, try with --disable-vsx, or adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_VSX_SSE],[1],[Altivec VSX intrinsics are supported on this CPU])
fi
if test x"${need_sparc64}" = x"yes"; then if test x"${need_sparc64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C) AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
......
...@@ -43,7 +43,10 @@ enum ELPA_SOLVERS { ...@@ -43,7 +43,10 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \ #define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \ ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
...@@ -69,7 +72,11 @@ enum ELPA_REAL_KERNELS { ...@@ -69,7 +72,11 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__) X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1, 15, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2, 16, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_VSX_BLOCK1, 17, @ELPA_2STAGE_COMPLEX_VSX_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_VSX_BLOCK2, 18, @ELPA_2STAGE_COMPLEX_VSX_BLOCK2_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \ #define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \ ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
...@@ -329,6 +329,7 @@ ...@@ -329,6 +329,7 @@
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. & kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. & kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. & kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC .or. & kernel .eq. ELPA_2STAGE_REAL_GENERIC .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. & kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. & kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
...@@ -620,7 +621,7 @@ ...@@ -620,7 +621,7 @@
#endif /* COMPLEXCASE */ #endif /* COMPLEXCASE */
#if REALCASE == 1 #if REALCASE == 1
! no sse block1 real kernel ! no sse, vsx, sparc64 block1 real kernel
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
...@@ -658,6 +659,41 @@ ...@@ -658,6 +659,41 @@
#endif /* COMPLEXCASE */ #endif /* COMPLEXCASE */
#if COMPLEXCASE == 1
! vsx block1 complex kernel
! Calls the single_hh_trafo ... _vsx_1hv_ routine once per column, walking
! j from ncols down to 1. The routine name is assembled across continuation
! lines from the MATH_DATATYPE and PRECISION placeholders (presumably
! preprocessor macros - confirm against the including file).
! Without WITH_FIXED_COMPLEX_KERNEL the section is selected at run time by
! comparing kernel against ELPA_2STAGE_COMPLEX_VSX_BLOCK1. With a fixed kernel
! the loop is compiled in only if the VSX block2 complex kernel is not defined.
#if defined(WITH_COMPLEX_VSX_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL))
! Stores the current MPI wall-clock time in ttt; how ttt is consumed is not
! visible in this section.
ttt = mpi_wtime()
do j = ncols, 1, -1
! The OpenMP variant indexes a with an additional my_thread dimension;
! otherwise both calls pass the same arguments.
#ifdef WITH_OPENMP
call single_hh_trafo_&
&MATH_DATATYPE&
&_vsx_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_vsx_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_VSX_BLOCK1_KERNEL */
#endif /* COMPLEXCASE */
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
! sse block1 complex kernel ! sse block1 complex kernel
...@@ -803,39 +839,41 @@ ...@@ -803,39 +839,41 @@
#endif /* WITH_REAL_SPARC64_BLOCK2_KERNEL */ #endif /* WITH_REAL_SPARC64_BLOCK2_KERNEL */
#endif /* REALCASE == 1 */ #endif /* REALCASE == 1 */
#if REALCASE == 1 #if REALCASE == 1
! implementation of sparc64 block 2 real case ! implementation of vsx block 2 real case
#if defined(WITH_REAL_SPARC64_BLOCK2_KERNEL) #if defined(WITH_REAL_VSX_BLOCK2_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL #ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2) then if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2) then
#endif /* not WITH_FIXED_REAL_KERNEL */ #endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) #if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL))
do j = ncols, 2, -2 do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1) w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
call double_hh_trafo_& call double_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_sse_2hv_& &_vsx_2hv_&
&PRECISION & &PRECISION &
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw) & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else #else
call double_hh_trafo_& call double_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_sse_2hv_& &_vsx_2hv_&
&PRECISION & &PRECISION &
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw) & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif #endif
enddo enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */ #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL #ifndef WITH_FIXED_REAL_KERNEL
endif endif
#endif /* not WITH_FIXED_REAL_KERNEL */ #endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK2_KERNEL */ #endif /* WITH_REAL_VSX_BLOCK2_KERNEL */
#endif /* REALCASE == 1 */ #endif /* REALCASE == 1 */
...@@ -885,6 +923,53 @@ ...@@ -885,6 +923,53 @@
#endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */ #endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */ #endif /* COMPLEXCASE == 1 */
#if COMPLEXCASE == 1
! implementation of vsx block 2 complex case
! Processes the columns two at a time with the double_hh_trafo ... _vsx_2hv_
! routine and, if one column is left over, finishes with the
! single_hh_trafo ... _vsx_1hv_ routine.
! Without WITH_FIXED_COMPLEX_KERNEL the section is selected at run time by
! comparing kernel against ELPA_2STAGE_COMPLEX_VSX_BLOCK2.
#if defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
! Stores the current MPI wall-clock time in ttt; how ttt is consumed is not
! visible in this section.
ttt = mpi_wtime()
do j = ncols, 2, -2
! w collects the two bcast_buffer columns handed to the 2hv routine:
! column j+off first, then column j+off-1.
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
! The OpenMP variant indexes a with an additional my_thread dimension;
! otherwise both calls pass the same arguments.
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_vsx_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_vsx_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
! After the stride-2 loop j equals 1 exactly when one column remains
! (odd ncols); that last column is handled by the 1hv routine.
#ifdef WITH_OPENMP
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_vsx_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_vsx_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_VSX_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
! implementation of sse block 2 complex case ! implementation of sse block 2 complex case
...@@ -1244,7 +1329,7 @@ ...@@ -1244,7 +1329,7 @@
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif #endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) */ #endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL #ifndef WITH_FIXED_REAL_KERNEL
endif endif
...@@ -1253,6 +1338,77 @@ ...@@ -1253,6 +1338,77 @@
#endif /* REALCASE */ #endif /* REALCASE */
#if REALCASE == 1
! vsx block4 real kernel
! Processes the columns in three stages: groups of four with the
! quad_hh_trafo ... _vsx_4hv_ routine, remaining pairs with the
! double_hh_trafo ... _vsx_2hv_ routine, and a final single column with the
! single_hh_trafo ... _cpu_ (or _cpu_openmp_) routine.
! Without WITH_FIXED_REAL_KERNEL the section is selected at run time by
! comparing kernel against ELPA_2STAGE_REAL_VSX_BLOCK4. With a fixed kernel
! the body is compiled in only if the VSX block6 real kernel is not defined.
#if defined(WITH_REAL_VSX_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK4) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL))
! VSX KERNEL, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4
! w collects four bcast_buffer columns, from j+off down to j+off-3.
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
! The OpenMP variant indexes a with an additional my_thread dimension.
#ifdef WITH_OPENMP
call quad_hh_trafo_&
&MATH_DATATYPE&
&_vsx_4hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&_vsx_4hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
! Continue from the value j holds after the stride-4 loop and handle the
! remaining columns two at a time with the 2hv routine.
do jj = j, 2, -2
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_vsx_2hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_vsx_2hv_&
&PRECISION&
& (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
! If jj ends at 1 a single column is left. It is passed to the _cpu_ /
! _cpu_openmp_ single_hh_trafo routine, which receives array sections of a
! and bcast_buffer instead of a c_loc pointer.
#ifdef WITH_OPENMP
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_VSX_BLOCK4_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1 #if REALCASE == 1
! sse block4 real kernel ! sse block4 real kernel
...@@ -1478,6 +1634,7 @@ ...@@ -1478,6 +1634,7 @@
!no avx512 block4 complex kernel !no avx512 block4 complex kernel
#endif /* COMPLEXCASE */ #endif /* COMPLEXCASE */
#if REALCASE == 1 #if REALCASE == 1
!sparc64 block6 real kernel !sparc64 block6 real kernel
#if defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) #if defined(WITH_REAL_SPARC64_BLOCK6_KERNEL)
...@@ -1515,7 +1672,7 @@ ...@@ -1515,7 +1672,7 @@
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
call quad_hh_trafo_& call quad_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_sparc64__4hv_& &_sparc64_4hv_&
&PRECISION& &PRECISION&
& (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw) & (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else #else
...@@ -1565,6 +1722,93 @@ ...@@ -1565,6 +1722,93 @@
#endif /* REALCASE */ #endif /* REALCASE */
#if REALCASE == 1
!vsx block6 real kernel
#if defined(WITH_REAL_VSX_BLOCK6_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK6) then
#endif /* not WITH_FIXED_REAL_KERNEL */
! VSX KERNEL, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
w(:,5) = bcast_buffer(1:nbw,j+off-4)
w(:,6) = bcast_buffer(1:nbw,j+off-5)
#ifdef WITH_OPENMP
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_vsx_6hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_vsx_6hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
enddo
do jj = j, 4, -4