Commit fa3e9892 authored by Andreas Marek's avatar Andreas Marek

Real block2 double-precision kernel for Power8-Computer

parent ab06e091
......@@ -209,6 +209,13 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if WITH_REAL_VSX_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_single_precision.c
endif
endif
if WITH_REAL_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
......@@ -237,13 +244,19 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if WITH_REAL_SPARC64_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c
endif
endif
#if WITH_REAL_SPARC64_BLOCK4_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c
#endif
#endif
#
#if WITH_REAL_VSX_BLOCK4_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_single_precision.c
#endif
#endif
if WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_4hv_double_precision.c
......@@ -273,13 +286,19 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if WITH_REAL_SPARC64_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
endif
endif
#if WITH_REAL_SPARC64_BLOCK6_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
#endif
#endif
#
#if WITH_REAL_VSX_BLOCK6_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_single_precision.c
#endif
#endif
if WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_6hv_double_precision.c
......@@ -309,13 +328,19 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
if WITH_COMPLEX_SPARC64_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_single_precision.c
endif
endif
#if WITH_COMPLEX_SPARC64_BLOCK1_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_single_precision.c
#endif
#endif
#
#if WITH_COMPLEX_VSX_BLOCK1_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_1hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_1hv_single_precision.c
#endif
#endif
if WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_1hv_double_precision.c
......@@ -346,12 +371,19 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
if WITH_COMPLEX_SPARC64_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_single_precision.c
endif
endif
#if WITH_COMPLEX_SPARC64_BLOCK2_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_single_precision.c
#endif
#endif
#
#if WITH_COMPLEX_VSX_BLOCK2_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_2hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_2hv_single_precision.c
#endif
#endif
if WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_2hv_double_precision.c
......
......@@ -474,6 +474,14 @@ m4_define(elpa_m4_sparc64_kernels, [
complex_sparc64_block2
])
m4_define(elpa_m4_vsx_kernels, [
real_vsx_block2
real_vsx_block4
real_vsx_block6
complex_vsx_block1
complex_vsx_block2
])
m4_define(elpa_m4_avx_kernels, [
real_avx_block2
real_avx_block4
......@@ -513,7 +521,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
......@@ -547,6 +555,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [
dnl Modify list of kernels with configure arguments
ELPA_SELECT_KERNELS([generic],[enable])
ELPA_SELECT_KERNELS([sparc64],[disable])
ELPA_SELECT_KERNELS([vsx],[disable])
ELPA_SELECT_KERNELS([sse],[enable])
ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable])
......@@ -561,7 +570,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi
......@@ -621,7 +630,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
])
fi
])
m4_foreach_w([elpa_m4_arch],[sparc64 sse avx avx2 avx512],[
m4_foreach_w([elpa_m4_arch],[sparc64 vsx sse avx avx2 avx512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
......@@ -655,7 +664,7 @@ dnl choosing a default kernel
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_generic_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
......@@ -681,6 +690,28 @@ dnl __m128d h1 = _fjsp_neg_v2r8(q);
dnl return 0;
dnl }
AC_LANG_PUSH([C])
if test x"${need_vsx}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile Altivec VSX with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <altivec.h>
int main(int argc, char **argv) {
__vector double a, b, c;
c = vec_add(a,b);
return 0;
}
])],
[can_compile_vsx=yes],
[can_compile_vsx=no]
)
AC_MSG_RESULT([${can_compile_vsx}])
if test x"$can_compile_vsx" != x"yes"; then
AC_MSG_ERROR([Could not compile test program, try with --disable-vsx, or adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_VSX_SSE],[1],[Altivec VSX intrinsics are supported on this CPU])
fi
if test x"${need_sparc64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
......
......@@ -43,7 +43,10 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......@@ -69,7 +72,11 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1, 15, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2, 16, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_VSX_BLOCK1, 17, @ELPA_2STAGE_COMPLEX_VSX_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_VSX_BLOCK2, 18, @ELPA_2STAGE_COMPLEX_VSX_BLOCK2_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
This diff is collapsed.
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF
#include "config-f90.h"
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "real_vsx_2hv_template.c"
#undef REALCASE
#undef DOUBLE_PRECISION
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment