elpa / elpa / commit fa3e9892
authored Nov 18, 2017 by Andreas Marek
Real block2 double-precision kernel for Power8-Computer
parent ab06e091
Showing 6 changed files with 2359 additions and 44 deletions
Makefile.am                                          +59   -27
configure.ac                                         +35    -4
elpa/elpa_constants.h.in                              +9    -2
src/elpa2/compute_hh_trafo.F90                      +255   -11
src/elpa2/kernels/real_vsx_2hv_double_precision.c    +55    -0
src/elpa2/kernels/real_vsx_2hv_template.c          +1946    -0
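The bulk of the change is the new template real_vsx_2hv_template.c (+1946 lines), whose body is not expanded in this view. For orientation, here is a minimal, hypothetical sketch of the style of code such a kernel contains: POWER VSX intrinsics from <altivec.h> applying a Householder reflector x <- x - tau*(v^T x)*v two doubles per vector register. The function name, loop structure, and compile flags below are illustrative, not copied from the commit:

/* Hypothetical sketch only -- not the actual ELPA kernel.
 * Assumed build on POWER8: gcc -O2 -mvsx sketch.c */
#include <altivec.h>

/* Apply x <- x - tau*(v^T x)*v, two doubles per VSX register. */
static void hh_apply_vsx(double *x, double *v, double tau, int n)
{
    __vector double acc = vec_splats(0.0);
    for (int i = 0; i < n; i += 2) {            /* n assumed even */
        __vector double xi = vec_xl(0, &x[i]);  /* load 2 doubles */
        __vector double vi = vec_xl(0, &v[i]);
        acc = vec_madd(xi, vi, acc);            /* fused multiply-add: v^T x */
    }
    double dot = acc[0] + acc[1];               /* horizontal sum of the 2 lanes */
    __vector double f = vec_splats(-tau * dot);
    for (int i = 0; i < n; i += 2) {
        __vector double xi = vec_xl(0, &x[i]);
        __vector double vi = vec_xl(0, &v[i]);
        vec_xst(vec_madd(vi, f, xi), 0, &x[i]); /* x += f*v, store back */
    }
}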
Makefile.am
@@ -209,6 +209,13 @@ if WANT_SINGLE_PRECISION_REAL
 endif
 endif
+if WITH_REAL_VSX_BLOCK2_KERNEL
+  libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_double_precision.c
+if WANT_SINGLE_PRECISION_REAL
+  libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_single_precision.c
+endif
+endif
 if WITH_REAL_SSE_BLOCK2_KERNEL
   libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_2hv_double_precision.c
 if WANT_SINGLE_PRECISION_REAL
@@ -237,13 +244,19 @@ if WANT_SINGLE_PRECISION_REAL
 endif
 endif
-#if WITH_REAL_SPARC64_BLOCK4_KERNEL
-# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
-#if WANT_SINGLE_PRECISION_REAL
-# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c
-#endif
-#endif
+if WITH_REAL_SPARC64_BLOCK4_KERNEL
+  libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
+if WANT_SINGLE_PRECISION_REAL
+  libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c
+endif
+endif
+#
+#if WITH_REAL_VSX_BLOCK4_KERNEL
+# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
+#if WANT_SINGLE_PRECISION_REAL
+# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_single_precision.c
+#endif
+#endif
 if WITH_REAL_SSE_BLOCK4_KERNEL
   libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_4hv_double_precision.c
@@ -273,13 +286,19 @@ if WANT_SINGLE_PRECISION_REAL
 endif
 endif
-#if WITH_REAL_SPARC64_BLOCK6_KERNEL
-# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
-#if WANT_SINGLE_PRECISION_REAL
-# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
-#endif
-#endif
+if WITH_REAL_SPARC64_BLOCK6_KERNEL
+  libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
+if WANT_SINGLE_PRECISION_REAL
+  libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
+endif
+endif
+#
+#if WITH_REAL_VSX_BLOCK6_KERNEL
+# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_double_precision.c
+#if WANT_SINGLE_PRECISION_REAL
+# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_single_precision.c
+#endif
+#endif
 if WITH_REAL_SSE_BLOCK6_KERNEL
   libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_6hv_double_precision.c
@@ -309,13 +328,19 @@ if WANT_SINGLE_PRECISION_REAL
 endif
 endif
-#if WITH_COMPLEX_SPARC64_BLOCK1_KERNEL
-# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_double_precision.c
-#if WANT_SINGLE_PRECISION_COMPLEX
-# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_single_precision.c
-#endif
-#endif
+if WITH_COMPLEX_SPARC64_BLOCK1_KERNEL
+  libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_double_precision.c
+if WANT_SINGLE_PRECISION_COMPLEX
+  libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_single_precision.c
+endif
+endif
+#
+#if WITH_COMPLEX_VSX_BLOCK1_KERNEL
+# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_1hv_double_precision.c
+#if WANT_SINGLE_PRECISION_COMPLEX
+# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_1hv_single_precision.c
+#endif
+#endif
 if WITH_COMPLEX_SSE_BLOCK1_KERNEL
   libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_1hv_double_precision.c
@@ -346,12 +371,19 @@ if WANT_SINGLE_PRECISION_COMPLEX
 endif
 endif
-#if WITH_COMPLEX_SPARC64_BLOCK2_KERNEL
-# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_double_precision.c
-#if WANT_SINGLE_PRECISION_COMPLEX
-# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_single_precision.c
-#endif
-#endif
+if WITH_COMPLEX_SPARC64_BLOCK2_KERNEL
+  libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_double_precision.c
+if WANT_SINGLE_PRECISION_COMPLEX
+  libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_single_precision.c
+endif
+endif
+#
+#if WITH_COMPLEX_VSX_BLOCK2_KERNEL
+# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_2hv_double_precision.c
+#if WANT_SINGLE_PRECISION_COMPLEX
+# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_2hv_single_precision.c
+#endif
+#endif
 if WITH_COMPLEX_SSE_BLOCK2_KERNEL
   libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_2hv_double_precision.c
configure.ac
@@ -474,6 +474,14 @@ m4_define(elpa_m4_sparc64_kernels, [
         complex_sparc64_block2
 ])
+
+m4_define(elpa_m4_vsx_kernels, [
+        real_vsx_block2
+        real_vsx_block4
+        real_vsx_block6
+        complex_vsx_block1
+        complex_vsx_block2
+])
 m4_define(elpa_m4_avx_kernels, [
         real_avx_block2
         real_avx_block4
@@ -513,7 +521,7 @@ m4_define(elpa_m4_gpu_kernels, [
         complex_gpu
 ])
-m4_define(elpa_m4_kernel_types, [generic sparc64 sse sse_assembly avx avx2 avx512 bgp bgq gpu])
+m4_define(elpa_m4_kernel_types, [generic sparc64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
 m4_define(elpa_m4_all_kernels,
         m4_foreach_w([elpa_m4_type],
@@ -547,6 +555,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [
 dnl Modify list of kernels with configure arguments
 ELPA_SELECT_KERNELS([generic],[enable])
 ELPA_SELECT_KERNELS([sparc64],[disable])
+ELPA_SELECT_KERNELS([vsx],[disable])
 ELPA_SELECT_KERNELS([sse],[enable])
 ELPA_SELECT_KERNELS([sse_assembly],[enable])
 ELPA_SELECT_KERNELS([avx],[enable])
@@ -561,7 +570,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
 ])
 if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
-        m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
+        m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
                 if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
                         echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
                 fi
@@ -621,7 +630,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
         ])
     fi
 ])
-m4_foreach_w([elpa_m4_arch],[sparc64 sse avx avx2 avx512],[
+m4_foreach_w([elpa_m4_arch],[sparc64 vsx sse avx avx2 avx512],[
         ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
         ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
         ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
@@ -655,7 +664,7 @@ dnl choosing a default kernel
 m4_foreach_w([elpa_m4_kind],[real complex],[
         m4_foreach_w([elpa_m4_kernel],
                 m4_foreach_w([elpa_m4_cand_kernel],
-                        elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_generic_kernels,
+                        elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels,
                         [m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
                 [
                         if test -z "$default_[]elpa_m4_kind[]_kernel"; then
@@ -681,6 +690,28 @@ dnl __m128d h1 = _fjsp_neg_v2r8(q);
 dnl     return 0;
 dnl   }
 AC_LANG_PUSH([C])
+
+if test x"${need_vsx}" = x"yes"; then
+  AC_MSG_CHECKING(whether we can compile Altivec VSX with intrinsics in C)
+  AC_COMPILE_IFELSE([AC_LANG_SOURCE([
+   #include <altivec.h>
+   int main(int argc, char **argv) {
+     __vector double a, b, c;
+     c = vec_add(a,b);
+     return 0;
+   }
+   ])],
+   [can_compile_vsx=yes],
+   [can_compile_vsx=no]
+  )
+  AC_MSG_RESULT([${can_compile_vsx}])
+  if test x"$can_compile_vsx" != x"yes"; then
+    AC_MSG_ERROR([Could not compile test program, try with --disable-vsx, or adjust the C compiler or CFLAGS])
+  fi
+  AC_DEFINE([HAVE_VSX_SSE],[1],[Altivec VSX intrinsics are supported on this CPU])
+fi
+
 if test x"${need_sparc64}" = x"yes"; then
   AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C)
   AC_COMPILE_IFELSE([AC_LANG_SOURCE([
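For reference, the compile probe added above can be reproduced outside of configure as a standalone program. The compiler flag is an assumption (configure itself relies on the user's CC and CFLAGS), but with GCC on POWER something like the following should work:

/* Standalone version of the VSX conftest above.
 * Assumed build command on POWER: gcc -mvsx conftest.c */
#include <altivec.h>

int main(void)
{
    __vector double a = vec_splats(1.0);
    __vector double b = vec_splats(2.0);
    __vector double c = vec_add(a, b);   /* the operation configure probes for */
    return (c[0] == 3.0) ? 0 : 1;        /* exits 0 when VSX addition works */
}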
elpa/elpa_constants.h.in
@@ -43,7 +43,10 @@ enum ELPA_SOLVERS {
         X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__) \
         X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
         X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
-        X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__)
+        X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
+        X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
+        X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
+        X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__)
 #define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
         ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
@@ -69,7 +72,11 @@ enum ELPA_REAL_KERNELS {
         X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
         X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
         X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
-        X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
+        X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__) \
+        X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1, 15, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1_COMPILED@, __VA_ARGS__) \
+        X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2, 16, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
+        X(ELPA_2STAGE_COMPLEX_VSX_BLOCK1, 17, @ELPA_2STAGE_COMPLEX_VSX_BLOCK1_COMPILED@, __VA_ARGS__) \
+        X(ELPA_2STAGE_COMPLEX_VSX_BLOCK2, 18, @ELPA_2STAGE_COMPLEX_VSX_BLOCK2_COMPILED@, __VA_ARGS__)
 #define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
         ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
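These kernel lists are X-macros: every kernel appears exactly once, and each expansion of ELPA_FOR_ALL_2STAGE_*_KERNELS(X) supplies its own definition of X (the @..._COMPILED@ placeholders are substituted by configure). A minimal self-contained sketch of the pattern, with illustrative names rather than ELPA's actual entries:

#include <stdio.h>

/* One list of (name, value) pairs; entries here are illustrative only. */
#define FOR_ALL_KERNELS(X) \
        X(KERNEL_GENERIC, 1) \
        X(KERNEL_VSX_BLOCK2, 22)

/* Expansion 1: generate the enum from the list. */
#define AS_ENUM(name, value) name = value,
enum kernels { FOR_ALL_KERNELS(AS_ENUM) };

/* Expansion 2: generate a name lookup from the same list. */
#define AS_CASE(name, value) case name: return #name;
static const char *kernel_name(int k)
{
    switch (k) { FOR_ALL_KERNELS(AS_CASE) default: return "unknown"; }
}

int main(void)
{
    printf("%d -> %s\n", KERNEL_VSX_BLOCK2, kernel_name(KERNEL_VSX_BLOCK2));
    return 0;
}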
src/elpa2/compute_hh_trafo.F90
@@ -329,6 +329,7 @@
                      kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
                      kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
                      kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
+                     kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
                      kernel .eq. ELPA_2STAGE_REAL_GENERIC .or. &
                      kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
                      kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
@@ -620,7 +621,7 @@
 #endif /* COMPLEXCASE */
 #if REALCASE == 1
-! no sse block1 real kernel
+! no sse, vsx, sparc64 block1 real kernel
 #endif
 #if COMPLEXCASE == 1
@@ -658,6 +659,41 @@
 #endif /* COMPLEXCASE */
+#if COMPLEXCASE == 1
+        ! vsx block1 complex kernel
+#if defined(WITH_COMPLEX_VSX_BLOCK1_KERNEL)
+#ifndef WITH_FIXED_COMPLEX_KERNEL
+        if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1) then
+#endif /* not WITH_FIXED_COMPLEX_KERNEL */
+#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL))
+          ttt = mpi_wtime()
+          do j = ncols, 1, -1
+#ifdef WITH_OPENMP
+            call single_hh_trafo_&
+            &MATH_DATATYPE&
+            &_vsx_1hv_&
+            &PRECISION&
+            & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off), nbw, nl, stripe_width)
+#else
+            call single_hh_trafo_&
+            &MATH_DATATYPE&
+            &_vsx_1hv_&
+            &PRECISION&
+            & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off), nbw, nl, stripe_width)
+#endif
+          enddo
+#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)) */
+#ifndef WITH_FIXED_COMPLEX_KERNEL
+        endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1)
+#endif /* not WITH_FIXED_COMPLEX_KERNEL */
+#endif /* WITH_COMPLEX_VSX_BLOCK1_KERNEL */
+#endif /* COMPLEXCASE */
 #if COMPLEXCASE == 1
        ! sse block1 complex kernel
@@ -803,39 +839,41 @@
 #endif /* WITH_REAL_SPARC64_BLOCK2_KERNEL */
 #endif /* REALCASE == 1 */
 #if REALCASE == 1
-       ! implementation of sparc64 block 2 real case
-#if defined(WITH_REAL_SPARC64_BLOCK2_KERNEL)
+       ! implementation of vsx block 2 real case
+#if defined(WITH_REAL_VSX_BLOCK2_KERNEL)
 #ifndef WITH_FIXED_REAL_KERNEL
-       if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2) then
+       if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2) then
 #endif /* not WITH_FIXED_REAL_KERNEL */
-#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
+#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL))
         do j = ncols, 2, -2
           w(:,1) = bcast_buffer(1:nbw,j+off)
          w(:,2) = bcast_buffer(1:nbw,j+off-1)
 #ifdef WITH_OPENMP
           call double_hh_trafo_&
           &MATH_DATATYPE&
-          &_sse_2hv_&
+          &_vsx_2hv_&
           &PRECISION&
           & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
 #else
           call double_hh_trafo_&
           &MATH_DATATYPE&
-          &_sse_2hv_&
+          &_vsx_2hv_&
           &PRECISION&
           & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
 #endif
         enddo
-#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */
+#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL)) */
 #ifndef WITH_FIXED_REAL_KERNEL
        endif
 #endif /* not WITH_FIXED_REAL_KERNEL */
-#endif /* WITH_REAL_SSE_BLOCK2_KERNEL */
+#endif /* WITH_REAL_VSX_BLOCK2_KERNEL */
 #endif /* REALCASE == 1 */
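For orientation: the Fortran dispatch above reaches the C kernel through ISO C binding, and Fortran passes scalar arguments by reference, so the double-precision entry point presumably has a signature along these lines (inferred from the call site, not copied from the new files):

/* Presumed C-side prototype matching the Fortran call
 *   double_hh_trafo_..._vsx_2hv_...(c_loc(a(...)), w, nbw, nl, stripe_width, nbw)
 * Scalars arrive as int* because Fortran passes by reference. */
void double_hh_trafo_real_vsx_2hv_double(double *q,    /* matrix stripe, c_loc(a(...)) */
                                         double *hh,   /* the two Householder vectors (w) */
                                         int *pnb,     /* nbw */
                                         int *pnq,     /* nl */
                                         int *pldq,    /* stripe_width */
                                         int *pldh);   /* nbw, leading dimension of hh */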
@@ -885,6 +923,53 @@
 #endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */
 #endif /* COMPLEXCASE == 1 */
+#if COMPLEXCASE == 1
+        ! implementation of vsx block 2 complex case
+#if defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)
+#ifndef WITH_FIXED_COMPLEX_KERNEL
+        if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2) then
+#endif /* not WITH_FIXED_COMPLEX_KERNEL */
+          ttt = mpi_wtime()
+          do j = ncols, 2, -2
+            w(:,1) = bcast_buffer(1:nbw,j+off)
+            w(:,2) = bcast_buffer(1:nbw,j+off-1)
+#ifdef WITH_OPENMP
+            call double_hh_trafo_&
+            &MATH_DATATYPE&
+            &_vsx_2hv_&
+            &PRECISION&
+            & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
+#else
+            call double_hh_trafo_&
+            &MATH_DATATYPE&
+            &_vsx_2hv_&
+            &PRECISION&
+            & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
+#endif
+          enddo
+#ifdef WITH_OPENMP
+          if (j==1) call single_hh_trafo_&
+          &MATH_DATATYPE&
+          &_vsx_1hv_&
+          &PRECISION&
+          & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
+#else
+          if (j==1) call single_hh_trafo_&
+          &MATH_DATATYPE&
+          &_vsx_1hv_&
+          &PRECISION&
+          & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
+#endif
+#ifndef WITH_FIXED_COMPLEX_KERNEL
+        endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2)
+#endif /* not WITH_FIXED_COMPLEX_KERNEL */
+#endif /* WITH_COMPLEX_VSX_BLOCK2_KERNEL */
+#endif /* COMPLEXCASE == 1 */
 #if COMPLEXCASE == 1
        ! implementation of sse block 2 complex case
@@ -1244,7 +1329,7 @@
           & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
 #endif
-#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) */
+#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL)) */
 #ifndef WITH_FIXED_REAL_KERNEL
        endif
@@ -1253,6 +1338,77 @@
 #endif /* REALCASE */
+#if REALCASE == 1
+        ! vsx block4 real kernel
+#if defined(WITH_REAL_VSX_BLOCK4_KERNEL)
+#ifndef WITH_FIXED_REAL_KERNEL
+        if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK4) then
+#endif /* not WITH_FIXED_REAL_KERNEL */
+#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL))
+          ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
+          do j = ncols, 4, -4
+            w(:,1) = bcast_buffer(1:nbw,j+off)
+            w(:,2) = bcast_buffer(1:nbw,j+off-1)
+            w(:,3) = bcast_buffer(1:nbw,j+off-2)
+            w(:,4) = bcast_buffer(1:nbw,j+off-3)
+#ifdef WITH_OPENMP
+            call quad_hh_trafo_&
+            &MATH_DATATYPE&
+            &_vsx_4hv_&
+            &PRECISION&
+            & (c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
+#else
+            call quad_hh_trafo_&
+            &MATH_DATATYPE&
+            &_vsx_4hv_&
+            &PRECISION&
+            & (c_loc(a(1,j+off+a_off-3,istripe)), w, nbw, nl, stripe_width, nbw)
+#endif
+          enddo
+          do jj = j, 2, -2
+            w(:,1) = bcast_buffer(1:nbw,jj+off)
+            w(:,2) = bcast_buffer(1:nbw,jj+off-1)
+#ifdef WITH_OPENMP
+            call double_hh_trafo_&
+            &MATH_DATATYPE&
+            &_vsx_2hv_&
+            &PRECISION&
+            & (c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
+#else
+            call double_hh_trafo_&
+            &MATH_DATATYPE&
+            &_vsx_2hv_&
+            &PRECISION&
+            & (c_loc(a(1,jj+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
+#endif
+          enddo
+#ifdef WITH_OPENMP
+          if (jj==1) call single_hh_trafo_&
+          &MATH_DATATYPE&
+          &_cpu_openmp_&
+          &PRECISION&
+          & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
+             bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
+#else
+          if (jj==1) call single_hh_trafo_&
+          &MATH_DATATYPE&
+          &_cpu_&
+          &PRECISION&
+          & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
+#endif
+#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL)) */
+#ifndef WITH_FIXED_REAL_KERNEL
+        endif
+#endif /* not WITH_FIXED_REAL_KERNEL */
+#endif /* WITH_REAL_VSX_BLOCK4_KERNEL */
+#endif /* REALCASE */
 #if REALCASE == 1
        ! sse block4 real kernel
@@ -1478,6 +1634,7 @@
        !no avx512 block4 complex kernel
 #endif /* COMPLEXCASE */
 #if REALCASE == 1
        !sparc64 block6 real kernel
 #if defined(WITH_REAL_SPARC64_BLOCK6_KERNEL)
@@ -1515,7 +1672,7 @@
 #ifdef WITH_OPENMP
           call quad_hh_trafo_&
           &MATH_DATATYPE&
-          &_sparc64__4hv_&
+          &_sparc64_4hv_&
           &PRECISION&
           & (c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
 #else
@@ -1565,6 +1722,93 @@
 #endif /* REALCASE */
+#if REALCASE == 1
+        !vsx block6 real kernel
+#if defined(WITH_REAL_VSX_BLOCK6_KERNEL)
+#ifndef WITH_FIXED_REAL_KERNEL
+        if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK6) then
+#endif /* not WITH_FIXED_REAL_KERNEL */
+          ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
+          do j = ncols, 6, -6