Commit 24867b0e authored by Andreas Marek's avatar Andreas Marek

Merge branch 'master' into ELPA_GPU

parents 3896c305 ebc097eb
......@@ -95,24 +95,44 @@ endif
endif
endif
if WITH_REAL_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c
endif
if WITH_REAL_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c
endif
if WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
endif
if WITH_REAL_AVX_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c
endif
if WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
endif
if WITH_REAL_AVX_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
endif
if WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp
endif
if WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
endif
if WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp
endif
if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp
endif
.cu.lo:
......
......@@ -151,6 +151,10 @@ install_real_generic_simple=yes
install_complex_generic=yes
install_complex_generic_simple=yes
#want_avx=yes
#want_avx2=yes
#want_sse=yes
AC_LANG([C])
dnl build with ftimings support
......@@ -204,12 +208,26 @@ $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s -o
if test "$?" == 0; then
can_compile_sse=yes
install_real_sse=yes
install_real_sse_block2=yes
install_real_sse_block4=yes
install_real_sse_block6=yes
install_complex_sse=yes
install_complex_sse_block1=yes
install_complex_sse_block2=yes
else
can_compile_sse=no
install_real_sse=no
install_real_sse_block2=no
install_real_sse_block4=no
install_real_sse_block6=no
install_complex_sse=no
install_complex_sse_block1=no
install_complex_sse_block2=no
fi
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}])
......@@ -249,6 +267,7 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
[can_compile_avx=no]
)
AC_MSG_RESULT([${can_compile_avx}])
if test "${can_compile_avx}" = "yes" ; then
AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++])
AC_LANG_PUSH([C++])
......@@ -306,7 +325,6 @@ if test "${can_compile_avx2}" = "yes" ; then
fi
fi
if test "${can_compile_avx}" = "yes" ; then
install_real_avx_block2=yes
install_real_avx_block4=yes
......@@ -314,8 +332,6 @@ if test "${can_compile_avx}" = "yes" ; then
install_complex_avx_block1=yes
install_complex_avx_block2=yes
want_avx=yes
else
install_real_avx_block2=no
install_real_avx_block4=no
......@@ -323,8 +339,34 @@ else
install_complex_avx_block1=no
install_complex_avx_block2=no
fi
if test "${can_compile_avx2}" = "yes" ; then
install_real_avx2_block2=yes
install_real_avx2_block4=yes
install_real_avx2_block6=yes
want_avx=yes
install_complex_avx2_block1=yes
install_complex_avx2_block2=yes
else
install_real_avx2_block2=no
install_real_avx2_block4=no
install_real_avx2_block6=no
install_complex_avx2_block1=no
install_complex_avx2_block2=no
fi
AM_CONDITIONAL([HAVE_SSE],[test x"$can_compile_sse" = x"yes"])
if test x"${can_compile_sse}" = x"yes" ; then
AC_DEFINE([HAVE_SSE],[1],[SSE is supported on this CPU])
fi
AM_CONDITIONAL([HAVE_AVX],[test x"$can_compile_avx" = x"yes"])
if test x"${can_compile_avx}" = x"yes" ; then
AC_DEFINE([HAVE_AVX],[1],[AVX is supported on this CPU])
fi
AM_CONDITIONAL([HAVE_AVX2],[test x"$can_compile_avx2" = x"yes"])
if test x"${can_compile_avx2}" = x"yes" ; then
AC_DEFINE([HAVE_AVX2],[1],[AVX2 is supported on this CPU])
fi
dnl set the AVX optimization flags if this option is specified
......@@ -492,7 +534,6 @@ AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environm
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
program test_get_environment
character(len=256) :: homedir
call get_environment_variable("HOME",homedir)
end program
......@@ -638,6 +679,15 @@ DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_
dnl bgq kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq])
dnl real-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block2-kernel-only],[real-sse-block2-kernel],[install_real_sse_block2])
dnl real-sse-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block4-kernel]-only,[real-sse-block4-kernel],[install_real_sse_block4])
dnl real-sse-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block6-kernel-only],[real-sse-block6-kernel],[install_real_sse_block6])
dnl real-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2])
......@@ -664,6 +714,12 @@ DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[in
dnl complex-bqq kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq])
dnl complex-sse-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block1-kernel-only],[complex-sse-block1-kernel],[install_complex_sse_block1])
dnl complex-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block2-kernel-only],[complex-sse-block2-kernel],[install_complex_sse_block2])
dnl complex-avx-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1])
......@@ -711,6 +767,21 @@ if test x"${install_complex_sse}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK2_KERNEL],[test x"$install_real_sse_block2" = x"yes"])
if test x"${install_real_sse_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK2_KERNEL],[1],[can use real_sse_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK4_KERNEL],[test x"$install_real_sse_block4" = x"yes"])
if test x"${install_real_sse_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK4_KERNEL],[1],[can use real_sse_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK6_KERNEL],[test x"$install_real_sse_block6" = x"yes"])
if test x"${install_real_sse_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK6_KERNEL],[1],[can use real_sse_block6 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"])
if test x"${install_real_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel])
......@@ -726,6 +797,31 @@ if test x"${install_real_avx_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK2_KERNEL],[test x"$install_real_avx2_block2" = x"yes"])
if test x"${install_real_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK2_KERNEL],[1],[can use real_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK4_KERNEL],[test x"$install_real_avx2_block4" = x"yes"])
if test x"${install_real_avx2_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK4_KERNEL],[1],[can use real_avx2_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK6_KERNEL],[test x"$install_real_avx2_block6" = x"yes"])
if test x"${install_real_avx2_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK6_KERNEL],[1],[can use real_avx2_block6 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"])
if test x"${install_complex_sse_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[test x"$install_complex_sse_block2" = x"yes"])
if test x"${install_complex_sse_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[1],[can use complex_sse_block2 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"])
if test x"${install_complex_avx_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel])
......@@ -736,6 +832,16 @@ if test x"${install_complex_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[test x"$install_complex_avx2_block1" = x"yes"])
if test x"${install_complex_avx2_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[1],[can use complex_avx2_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[test x"$install_complex_avx2_block2" = x"yes"])
if test x"${install_complex_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[1],[can use complex_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"])
if test x"${install_real_bgp}" = x"yes" ; then
AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel])
......@@ -840,14 +946,14 @@ mkdir -p test/shared_sources
grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1
if test "${can_compile_avx}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then
# if test x"${want_avx}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX instructions])
fi
# fi
fi
if test "${can_compile_avx2}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then
# if test x"${want_avx2}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX2 instructions])
fi
# fi
fi
if test "${can_compile_sse}" = "no" ; then
......
......@@ -3,21 +3,30 @@
#define ELPA2_REAL_KERNEL_BGP 3
#define ELPA2_REAL_KERNEL_BGQ 4
#define ELPA2_REAL_KERNEL_SSE 5
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8
#define ELPA2_REAL_KERNEL_GPU 9
#define ELPA2_NUMBER_OF_REAL_KERNELS 9
#define ELPA2_REAL_KERNEL_SSE_BLOCK2 6
#define ELPA2_REAL_KERNEL_SSE_BLOCK4 7
#define ELPA2_REAL_KERNEL_SSE_BLOCK6 8
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 9
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 10
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 11
#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12
#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13
#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14
#define ELPA2_REAL_KERNEL_GPU 15
#define ELPA2_NUMBER_OF_REAL_KERNELS 15
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
#define ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE 2
#define ELPA2_COMPLEX_KERNEL_BGP 3
#define ELPA2_COMPLEX_KERNEL_BGQ 4
#define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_GPU 8
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 8
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11
#define ELPA2_COMPLEX_KERNEL_GPU 12
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 8
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 12
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment