Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
10
Issues
10
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
fa3e9892
Commit
fa3e9892
authored
Nov 18, 2017
by
Andreas Marek
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Real block2 double-precision kernel for Power8-Computer
parent
ab06e091
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
2359 additions
and
44 deletions
+2359
-44
Makefile.am
Makefile.am
+59
-27
configure.ac
configure.ac
+35
-4
elpa/elpa_constants.h.in
elpa/elpa_constants.h.in
+9
-2
src/elpa2/compute_hh_trafo.F90
src/elpa2/compute_hh_trafo.F90
+255
-11
src/elpa2/kernels/real_vsx_2hv_double_precision.c
src/elpa2/kernels/real_vsx_2hv_double_precision.c
+55
-0
src/elpa2/kernels/real_vsx_2hv_template.c
src/elpa2/kernels/real_vsx_2hv_template.c
+1946
-0
No files found.
Makefile.am
View file @
fa3e9892
...
...
@@ -209,6 +209,13 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if
WITH_REAL_VSX_BLOCK2_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_vsx_2hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_vsx_2hv_single_precision.c
endif
endif
if
WITH_REAL_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_sse_2hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
...
...
@@ -237,13 +244,19 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if
WITH_REAL_SPARC64_BLOCK4_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_sparc64_4hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_sparc64_4hv_single_precision.c
endif
endif
#if WITH_REAL_SPARC64_BLOCK4_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c
#endif
#endif
#
#if WITH_REAL_VSX_BLOCK4_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_single_precision.c
#endif
#endif
if
WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_sse_4hv_double_precision.c
...
...
@@ -273,13 +286,19 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if
WITH_REAL_SPARC64_BLOCK6_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_sparc64_6hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_sparc64_6hv_single_precision.c
endif
endif
#if WITH_REAL_SPARC64_BLOCK6_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
#endif
#endif
#
#if WITH_REAL_VSX_BLOCK6_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_single_precision.c
#endif
#endif
if
WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_sse_6hv_double_precision.c
...
...
@@ -309,13 +328,19 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if
WITH_COMPLEX_SPARC64_BLOCK1_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/complex_sparc64_1hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/complex_sparc64_1hv_single_precision.c
endif
endif
#if WITH_COMPLEX_SPARC64_BLOCK1_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_single_precision.c
#endif
#endif
#
#if WITH_COMPLEX_VSX_BLOCK1_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_1hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_1hv_single_precision.c
#endif
#endif
if
WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/complex_sse_1hv_double_precision.c
...
...
@@ -346,12 +371,19 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
if
WITH_COMPLEX_SPARC64_BLOCK2_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/complex_sparc64_2hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/complex_sparc64_2hv_single_precision.c
endif
endif
#if WITH_COMPLEX_SPARC64_BLOCK2_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_single_precision.c
#endif
#endif
#
#if WITH_COMPLEX_VSX_BLOCK2_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_2hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_2hv_single_precision.c
#endif
#endif
if
WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/complex_sse_2hv_double_precision.c
...
...
configure.ac
View file @
fa3e9892
...
...
@@ -474,6 +474,14 @@ m4_define(elpa_m4_sparc64_kernels, [
complex_sparc64_block2
])
m4_define(elpa_m4_vsx_kernels, [
real_vsx_block2
real_vsx_block4
real_vsx_block6
complex_vsx_block1
complex_vsx_block2
])
m4_define(elpa_m4_avx_kernels, [
real_avx_block2
real_avx_block4
...
...
@@ -513,7 +521,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64
vsx
sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
...
...
@@ -547,6 +555,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [
dnl Modify list of kernels with configure arguments
ELPA_SELECT_KERNELS([generic],[enable])
ELPA_SELECT_KERNELS([sparc64],[disable])
ELPA_SELECT_KERNELS([vsx],[disable])
ELPA_SELECT_KERNELS([sse],[enable])
ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable])
...
...
@@ -561,7 +570,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_
vsx_kernels elpa_m4_
sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi
...
...
@@ -621,7 +630,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
])
fi
])
m4_foreach_w([elpa_m4_arch],[sparc64 sse avx avx2 avx512],[
m4_foreach_w([elpa_m4_arch],[sparc64
vsx
sse avx avx2 avx512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
...
...
@@ -655,7 +664,7 @@ dnl choosing a default kernel
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_generic_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_
vsx_kernels elpa_m4_
generic_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
...
...
@@ -681,6 +690,28 @@ dnl __m128d h1 = _fjsp_neg_v2r8(q);
dnl return 0;
dnl }
AC_LANG_PUSH([C])
if test x"${need_vsx}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile Altivec VSX with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <altivec.h>
int main(int argc, char **argv) {
__vector double a, b, c;
c = vec_add(a,b);
return 0;
}
])],
[can_compile_vsx=yes],
[can_compile_vsx=no]
)
AC_MSG_RESULT([${can_compile_vsx}])
if test x"$can_compile_vsx" != x"yes"; then
AC_MSG_ERROR([Could not compile test program, try with --disable-vsx, or adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_VSX_SSE],[1],[Altivec VSX intrinsics are supported on this CPU])
fi
if test x"${need_sparc64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
...
...
elpa/elpa_constants.h.in
View file @
fa3e9892
...
...
@@ -43,7 +43,10 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
...
...
@@ -69,7 +72,11 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1, 15, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2, 16, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_VSX_BLOCK1, 17, @ELPA_2STAGE_COMPLEX_VSX_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_VSX_BLOCK2, 18, @ELPA_2STAGE_COMPLEX_VSX_BLOCK2_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
...
...
src/elpa2/compute_hh_trafo.F90
View file @
fa3e9892
...
...
@@ -329,6 +329,7 @@
kernel
.eq.
ELPA_2STAGE_REAL_AVX512_BLOCK2
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_SSE_BLOCK2
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_SPARC64_BLOCK2
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_VSX_BLOCK2
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_GENERIC
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_GENERIC_SIMPLE
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_SSE_ASSEMBLY
.or.
&
...
...
@@ -620,7 +621,7 @@
#endif /* COMPLEXCASE */
#if REALCASE == 1
! no sse block1 real kernel
! no sse
, vsx, sparc64
block1 real kernel
#endif
#if COMPLEXCASE == 1
...
...
@@ -658,6 +659,41 @@
#endif /* COMPLEXCASE */
#if COMPLEXCASE == 1
! vsx block1 complex kernel
#if defined(WITH_COMPLEX_VSX_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_VSX_BLOCK1
)
then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL))
ttt
=
mpi_wtime
()
do
j
=
ncols
,
1
,
-1
#ifdef WITH_OPENMP
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#else
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif
! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_VSX_BLOCK1_KERNEL */
#endif /* COMPLEXCASE */
#if COMPLEXCASE == 1
! sse block1 complex kernel
...
...
@@ -803,39 +839,41 @@
#endif /* WITH_REAL_SPARC64_BLOCK2_KERNEL */
#endif /* REALCASE == 1 */
#if REALCASE == 1
! implementation of
sparc64
block 2 real case
#if defined(WITH_REAL_
SPARC64
_BLOCK2_KERNEL)
! implementation of
vsx
block 2 real case
#if defined(WITH_REAL_
VSX
_BLOCK2_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_REAL_
SSE
_BLOCK2
)
then
if
(
kernel
.eq.
ELPA_2STAGE_REAL_
VSX
_BLOCK2
)
then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_
SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE
_BLOCK4_KERNEL))
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_
VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX
_BLOCK4_KERNEL))
do
j
=
ncols
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sse
_2hv_
&
&
_
vsx
_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sse
_2hv_
&
&
_
vsx
_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_
SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE
_BLOCK4_KERNEL)) */
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_
VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX
_BLOCK4_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_
SSE
_BLOCK2_KERNEL */
#endif /* WITH_REAL_
VSX
_BLOCK2_KERNEL */
#endif /* REALCASE == 1 */
...
...
@@ -885,6 +923,53 @@
#endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */
#if COMPLEXCASE == 1
! implementation of vsx block 2 complex case
#if defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_VSX_BLOCK2
)
then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt
=
mpi_wtime
()
do
j
=
ncols
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#ifdef WITH_OPENMP
if
(
j
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
1
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#else
if
(
j
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
1
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif
! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_VSX_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */
#if COMPLEXCASE == 1
! implementation of sse block 2 complex case
...
...
@@ -1244,7 +1329,7 @@
&
(
a
(
1
:
stripe_width
,
1
+
off
+
a_off
:
1
+
off
+
a_off
+
nbw
-1
,
istripe
),
bcast_buffer
(
1
:
nbw
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_S
SE
_BLOCK6_KERNEL)) */
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_S
PARC64
_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
...
...
@@ -1253,6 +1338,77 @@
#endif /* REALCASE */
#if REALCASE == 1
! vsx block4 real kernel
#if defined(WITH_REAL_VSX_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_REAL_VSX_BLOCK4
)
then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL))
! VSX INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do
j
=
ncols
,
4
,
-4
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
w
(:,
3
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-2
)
w
(:,
4
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-3
)
#ifdef WITH_OPENMP
call
quad_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_4hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-3
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
quad_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_4hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-3
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
do
jj
=
j
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jj
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jj
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#ifdef WITH_OPENMP
if
(
jj
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
cpu_openmp_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
1
+
off
+
a_off
:
1
+
off
+
a_off
+
nbw
-1
,
istripe
,
my_thread
),
&
bcast_buffer
(
1
:
nbw
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#else
if
(
jj
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
cpu_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
1
+
off
+
a_off
:
1
+
off
+
a_off
+
nbw
-1
,
istripe
),
bcast_buffer
(
1
:
nbw
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_VSX_BLOCK4_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1
! sse block4 real kernel
...
...
@@ -1478,6 +1634,7 @@
!no avx512 block4 complex kernel
#endif /* COMPLEXCASE */
#if REALCASE == 1
!sparc64 block6 real kernel
#if defined(WITH_REAL_SPARC64_BLOCK6_KERNEL)
...
...
@@ -1515,7 +1672,7 @@
#ifdef WITH_OPENMP
call
quad_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sparc64_
_
4hv_
&
&
_
sparc64_4hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
...
...
@@ -1565,6 +1722,93 @@
#endif /* REALCASE */
#if REALCASE == 1
!vsx block6 real kernel
#if defined(WITH_REAL_VSX_BLOCK6_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_REAL_VSX_BLOCK6
)
then
#endif /* not WITH_FIXED_REAL_KERNEL */
! VSX INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do
j
=
ncols
,
6
,
-6
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
w
(:,
3
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-2
)
w
(:,
4
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-3
)
w
(:,
5
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-4
)
w
(:,
6
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-5
)
#ifdef WITH_OPENMP
call
hexa_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_6hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-5
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
hexa_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_6hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-5
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
do
jj
=
j
,
4
,
-4
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-1
)
w
(:,
3
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-2
)
w
(:,
4
)
=
bcast_buffer
(
1
:
nbw
,
jj
+
off
-3
)
#ifdef WITH_OPENMP
call
quad_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_4hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
quad_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_4hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jj
+
off
+
a_off
-3
,
istripe
)),
w
,
&
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
do
jjj
=
jj
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
jjj
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
jjj
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jjj
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
vsx_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
jjj
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#ifdef WITH_OPENMP
if
(
jjj
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
cpu_openmp_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
1
+
off
+
a_off
:
1
+
off
+
a_off
+
nbw
-1
,
istripe
,
my_thread
),
&
bcast_buffer
(
1
:
nbw
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#else
if
(
jjj
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
cpu_
&
&
PRECISION
&
&
(
a
(
1
:
stripe_width
,
1
+
off
+
a_off
:
1
+
off
+
a_off
+
nbw
-1
,
istripe
),
bcast_buffer
(
1
:
nbw
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#endif
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_VSX_BLOCK6_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1
!sse block6 real kernel
#if defined(WITH_REAL_SSE_BLOCK6_KERNEL)
...
...
src/elpa2/kernels/real_vsx_2hv_double_precision.c
0 → 100644
View file @
fa3e9892
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//