Commit 24867b0e authored by Andreas Marek's avatar Andreas Marek

Merge branch 'master' into ELPA_GPU

parents 3896c305 ebc097eb
......@@ -95,24 +95,44 @@ endif
endif
endif
if WITH_REAL_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c
endif
if WITH_REAL_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c
endif
if WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
endif
if WITH_REAL_AVX_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c
endif
if WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
endif
if WITH_REAL_AVX_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
endif
if WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp
endif
if WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
endif
if WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp
endif
if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp
endif
.cu.lo:
......
......@@ -151,6 +151,10 @@ install_real_generic_simple=yes
install_complex_generic=yes
install_complex_generic_simple=yes
#want_avx=yes
#want_avx2=yes
#want_sse=yes
AC_LANG([C])
dnl build with ftimings support
......@@ -204,12 +208,26 @@ $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s -o
# Evaluate the exit status of the assembler kernel compile attempt that runs
# immediately before this point and derive the SSE kernel install flags.
# The comparison uses "=" because "==" is a bash extension: a strict POSIX
# test, for example dash running as /bin/sh, rejects it with an error, and
# the else branch would then be taken even when the compile succeeded.
if test "$?" = 0; then
can_compile_sse=yes
install_real_sse=yes
install_real_sse_block2=yes
install_real_sse_block4=yes
install_real_sse_block6=yes
install_complex_sse=yes
install_complex_sse_block1=yes
install_complex_sse_block2=yes
else
can_compile_sse=no
install_real_sse=no
install_real_sse_block2=no
install_real_sse_block4=no
install_real_sse_block6=no
install_complex_sse=no
install_complex_sse_block1=no
install_complex_sse_block2=no
fi
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}])
......@@ -249,6 +267,7 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
[can_compile_avx=no]
)
AC_MSG_RESULT([${can_compile_avx}])
if test "${can_compile_avx}" = "yes" ; then
AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++])
AC_LANG_PUSH([C++])
......@@ -306,7 +325,6 @@ if test "${can_compile_avx2}" = "yes" ; then
fi
fi
if test "${can_compile_avx}" = "yes" ; then
install_real_avx_block2=yes
install_real_avx_block4=yes
......@@ -314,8 +332,6 @@ if test "${can_compile_avx}" = "yes" ; then
install_complex_avx_block1=yes
install_complex_avx_block2=yes
want_avx=yes
else
install_real_avx_block2=no
install_real_avx_block4=no
......@@ -323,8 +339,34 @@ else
install_complex_avx_block1=no
install_complex_avx_block2=no
fi
dnl AVX2 block kernels start out disabled and are switched on only when the
dnl AVX2 compile test reported success in can_compile_avx2.
install_real_avx2_block2=no
install_real_avx2_block4=no
install_real_avx2_block6=no
install_complex_avx2_block1=no
install_complex_avx2_block2=no
if test "${can_compile_avx2}" = "yes" ; then
  install_complex_avx2_block2=yes
  install_complex_avx2_block1=yes
  dnl NOTE(review): this sets want_avx, not want_avx2 -- confirm that is intended.
  want_avx=yes
  install_real_avx2_block6=yes
  install_real_avx2_block4=yes
  install_real_avx2_block2=yes
fi
dnl Publish the outcome of the SSE, AVX and AVX2 compile tests. Each
dnl can_compile_* variable drives an automake conditional and, when it is
dnl "yes", a preprocessor define with the same name as the conditional.
dnl The flags reflect what the compiler could build during configure.
AM_CONDITIONAL([HAVE_SSE],[test x"$can_compile_sse" = x"yes"])
if test x"${can_compile_sse}" = x"yes" ; then
AC_DEFINE([HAVE_SSE],[1],[SSE is supported on this CPU])
fi
AM_CONDITIONAL([HAVE_AVX],[test x"$can_compile_avx" = x"yes"])
if test x"${can_compile_avx}" = x"yes" ; then
AC_DEFINE([HAVE_AVX],[1],[AVX is supported on this CPU])
fi
AM_CONDITIONAL([HAVE_AVX2],[test x"$can_compile_avx2" = x"yes"])
if test x"${can_compile_avx2}" = x"yes" ; then
AC_DEFINE([HAVE_AVX2],[1],[AVX2 is supported on this CPU])
fi
dnl set the AVX optimization flags if this option is specified
......@@ -492,7 +534,6 @@ AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environm
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
program test_get_environment
character(len=256) :: homedir
call get_environment_variable("HOME",homedir)
end program
......@@ -638,6 +679,15 @@ DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_
dnl bgq kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq])
dnl Kernel-only options for the real SSE block kernels. Every call passes
dnl the option name, the kernel name and the install flag variable, each
dnl fully enclosed in m4 quotes.
dnl real-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block2-kernel-only],[real-sse-block2-kernel],[install_real_sse_block2])
dnl real-sse-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block4-kernel-only],[real-sse-block4-kernel],[install_real_sse_block4])
dnl real-sse-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block6-kernel-only],[real-sse-block6-kernel],[install_real_sse_block6])
dnl real-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2])
......@@ -664,6 +714,12 @@ DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[in
dnl complex-bgq kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq])
dnl complex-sse-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block1-kernel-only],[complex-sse-block1-kernel],[install_complex_sse_block1])
dnl complex-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block2-kernel-only],[complex-sse-block2-kernel],[install_complex_sse_block2])
dnl complex-avx-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1])
......@@ -711,6 +767,21 @@ if test x"${install_complex_sse}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel])
fi
dnl Real SSE block kernels with block sizes 2, 4 and 6: for each
dnl install_real_sse_block* flag, declare the matching automake conditional
dnl and, when the flag is "yes", define the preprocessor symbol of the same name.
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK2_KERNEL],[test x"$install_real_sse_block2" = x"yes"])
if test x"${install_real_sse_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK2_KERNEL],[1],[can use real_sse_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK4_KERNEL],[test x"$install_real_sse_block4" = x"yes"])
if test x"${install_real_sse_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK4_KERNEL],[1],[can use real_sse_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK6_KERNEL],[test x"$install_real_sse_block6" = x"yes"])
if test x"${install_real_sse_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK6_KERNEL],[1],[can use real_sse_block6 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"])
if test x"${install_real_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel])
......@@ -726,6 +797,31 @@ if test x"${install_real_avx_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel])
fi
dnl Real AVX2 block kernels with block sizes 2, 4 and 6: each
dnl install_real_avx2_block* flag yields an automake conditional and, when
dnl "yes", a preprocessor define of the same name.
dnl NOTE(review): the visible Makefile.am hunk lists the avx-avx2 sources under
dnl the WITH_REAL_AVX_BLOCK*_KERNEL conditionals rather than these AVX2 ones;
dnl confirm that is intended.
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK2_KERNEL],[test x"$install_real_avx2_block2" = x"yes"])
if test x"${install_real_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK2_KERNEL],[1],[can use real_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK4_KERNEL],[test x"$install_real_avx2_block4" = x"yes"])
if test x"${install_real_avx2_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK4_KERNEL],[1],[can use real_avx2_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK6_KERNEL],[test x"$install_real_avx2_block6" = x"yes"])
if test x"${install_real_avx2_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK6_KERNEL],[1],[can use real_avx2_block6 kernel])
fi
dnl Complex SSE block kernels with block sizes 1 and 2, same pattern as above.
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"])
if test x"${install_complex_sse_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[test x"$install_complex_sse_block2" = x"yes"])
if test x"${install_complex_sse_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[1],[can use complex_sse_block2 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"])
if test x"${install_complex_avx_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel])
......@@ -736,6 +832,16 @@ if test x"${install_complex_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel])
fi
dnl Complex AVX2 block kernels with block sizes 1 and 2: each
dnl install_complex_avx2_block* flag yields an automake conditional and, when
dnl "yes", a preprocessor define of the same name.
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[test x"$install_complex_avx2_block1" = x"yes"])
if test x"${install_complex_avx2_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[1],[can use complex_avx2_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[test x"$install_complex_avx2_block2" = x"yes"])
if test x"${install_complex_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[1],[can use complex_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"])
if test x"${install_real_bgp}" = x"yes" ; then
AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel])
......@@ -840,14 +946,14 @@ mkdir -p test/shared_sources
grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1
if test "${can_compile_avx}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then
# if test x"${want_avx}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX instructions])
fi
# fi
fi
if test "${can_compile_avx2}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then
# if test x"${want_avx2}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX2 instructions])
fi
# fi
fi
if test "${can_compile_sse}" = "no" ; then
......
......@@ -3,21 +3,30 @@
#define ELPA2_REAL_KERNEL_BGP 3
#define ELPA2_REAL_KERNEL_BGQ 4
#define ELPA2_REAL_KERNEL_SSE 5
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8
#define ELPA2_REAL_KERNEL_GPU 9
#define ELPA2_NUMBER_OF_REAL_KERNELS 9
#define ELPA2_REAL_KERNEL_SSE_BLOCK2 6
#define ELPA2_REAL_KERNEL_SSE_BLOCK4 7
#define ELPA2_REAL_KERNEL_SSE_BLOCK6 8
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 9
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 10
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 11
#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12
#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13
#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14
#define ELPA2_REAL_KERNEL_GPU 15
#define ELPA2_NUMBER_OF_REAL_KERNELS 15
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
#define ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE 2
#define ELPA2_COMPLEX_KERNEL_BGP 3
#define ELPA2_COMPLEX_KERNEL_BGQ 4
#define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_GPU 8
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 8
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11
#define ELPA2_COMPLEX_KERNEL_GPU 12
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 8
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 12
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -73,14 +73,21 @@ module ELPA2_utilities
public :: get_actual_real_kernel_name, get_actual_complex_kernel_name
public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, &
REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ, &
REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_AVX_BLOCK2, &
REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_SSE_BLOCK2, &
REAL_ELPA_KERNEL_SSE_BLOCK4, REAL_ELPA_KERNEL_SSE_BLOCK6, &
REAL_ELPA_KERNEL_AVX_BLOCK2, &
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, &
REAL_ELPA_KERNEL_AVX2_BLOCK2, &
REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6, &
REAL_ELPA_KERNEL_GPU
public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, &
COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_AVX_BLOCK1, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK2, &
COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1, &
COMPLEX_ELPA_KERNEL_SSE_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2, &
COMPLEX_ELPA_KERNEL_GPU
public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES
......@@ -95,17 +102,22 @@ module ELPA2_utilities
public :: qr_decomposition_via_environment_variable
integer(kind=ik), parameter :: number_of_real_kernels = ELPA2_NUMBER_OF_REAL_KERNELS
integer(kind=ik), parameter :: REAL_ELPA_KERNEL_GENERIC = ELPA2_REAL_KERNEL_GENERIC
integer(kind=ik), parameter :: REAL_ELPA_KERNEL_GENERIC_SIMPLE = ELPA2_REAL_KERNEL_GENERIC_SIMPLE
integer(kind=ik), parameter :: REAL_ELPA_KERNEL_BGP = ELPA2_REAL_KERNEL_BGP
integer(kind=ik), parameter :: REAL_ELPA_KERNEL_BGQ = ELPA2_REAL_KERNEL_BGQ
integer(kind=ik), parameter :: REAL_ELPA_KERNEL_SSE = ELPA2_REAL_KERNEL_SSE
integer(kind=ik), parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_REAL_KERNEL_AVX_BLOCK2
integer(kind=ik), parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4 = ELPA2_REAL_KERNEL_AVX_BLOCK4
integer(kind=ik), parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6 = ELPA2_REAL_KERNEL_AVX_BLOCK6
integer(kind=ik), parameter :: REAL_ELPA_KERNEL_GPU = ELPA2_REAL_KERNEL_GPU
integer, parameter :: number_of_real_kernels = ELPA2_NUMBER_OF_REAL_KERNELS
integer, parameter :: REAL_ELPA_KERNEL_GENERIC = ELPA2_REAL_KERNEL_GENERIC
integer, parameter :: REAL_ELPA_KERNEL_GENERIC_SIMPLE = ELPA2_REAL_KERNEL_GENERIC_SIMPLE
integer, parameter :: REAL_ELPA_KERNEL_BGP = ELPA2_REAL_KERNEL_BGP
integer, parameter :: REAL_ELPA_KERNEL_BGQ = ELPA2_REAL_KERNEL_BGQ
integer, parameter :: REAL_ELPA_KERNEL_SSE = ELPA2_REAL_KERNEL_SSE
integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK2 = ELPA2_REAL_KERNEL_SSE_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK4 = ELPA2_REAL_KERNEL_SSE_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK6 = ELPA2_REAL_KERNEL_SSE_BLOCK6
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_REAL_KERNEL_AVX_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4 = ELPA2_REAL_KERNEL_AVX_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6 = ELPA2_REAL_KERNEL_AVX_BLOCK6
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_REAL_KERNEL_AVX2_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK4 = ELPA2_REAL_KERNEL_AVX2_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK6 = ELPA2_REAL_KERNEL_AVX2_BLOCK6
integer(kind=ik), parameter :: REAL_ELPA_KERNEL_GPU = ELPA2_REAL_KERNEL_GPU
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
......@@ -187,21 +199,30 @@ module ELPA2_utilities
"REAL_ELPA_KERNEL_BGP ", &
"REAL_ELPA_KERNEL_BGQ ", &
"REAL_ELPA_KERNEL_SSE ", &
"REAL_ELPA_KERNEL_SSE_BLOCK2 ", &
"REAL_ELPA_KERNEL_SSE_BLOCK4 ", &
"REAL_ELPA_KERNEL_SSE_BLOCK6 ", &
"REAL_ELPA_KERNEL_AVX_BLOCK2 ", &
"REAL_ELPA_KERNEL_AVX_BLOCK4 ", &
"REAL_ELPA_KERNEL_AVX_BLOCK6 ", &
"REAL_ELPA_KERNEL_AVX2_BLOCK2 ", &
"REAL_ELPA_KERNEL_AVX2_BLOCK4 ", &
"REAL_ELPA_KERNEL_AVX2_BLOCK6 ", &
"REAL_ELPA_KERNEL_GPU "/)
integer(kind=ik), parameter :: number_of_complex_kernels = ELPA2_NUMBER_OF_COMPLEX_KERNELS
integer(kind=ik), parameter :: COMPLEX_ELPA_KERNEL_GENERIC = ELPA2_COMPLEX_KERNEL_GENERIC
integer(kind=ik), parameter :: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE = ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE
integer(kind=ik), parameter :: COMPLEX_ELPA_KERNEL_BGP = ELPA2_COMPLEX_KERNEL_BGP
integer(kind=ik), parameter :: COMPLEX_ELPA_KERNEL_BGQ = ELPA2_COMPLEX_KERNEL_BGQ
integer(kind=ik), parameter :: COMPLEX_ELPA_KERNEL_SSE = ELPA2_COMPLEX_KERNEL_SSE
integer(kind=ik), parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK1
integer(kind=ik), parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK2
integer(kind=ik), parameter :: COMPLEX_ELPA_KERNEL_GPU = ELPA2_COMPLEX_KERNEL_GPU
integer, parameter :: number_of_complex_kernels = ELPA2_NUMBER_OF_COMPLEX_KERNELS
integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC = ELPA2_COMPLEX_KERNEL_GENERIC
integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE = ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE
integer, parameter :: COMPLEX_ELPA_KERNEL_BGP = ELPA2_COMPLEX_KERNEL_BGP
integer, parameter :: COMPLEX_ELPA_KERNEL_BGQ = ELPA2_COMPLEX_KERNEL_BGQ
integer, parameter :: COMPLEX_ELPA_KERNEL_SSE = ELPA2_COMPLEX_KERNEL_SSE
integer, parameter :: COMPLEX_ELPA_KERNEL_SSE_BLOCK1 = ELPA2_COMPLEX_KERNEL_SSE_BLOCK1
integer, parameter :: COMPLEX_ELPA_KERNEL_SSE_BLOCK2 = ELPA2_COMPLEX_KERNEL_SSE_BLOCK2
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK1
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK2
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2
integer(kind=ik), parameter :: COMPLEX_ELPA_KERNEL_GPU = ELPA2_COMPLEX_KERNEL_GPU
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
......@@ -268,8 +289,12 @@ module ELPA2_utilities
"COMPLEX_ELPA_KERNEL_BGP ", &
"COMPLEX_ELPA_KERNEL_BGQ ", &
"COMPLEX_ELPA_KERNEL_SSE ", &
"COMPLEX_ELPA_KERNEL_SSE_BLOCK1 ", &
"COMPLEX_ELPA_KERNEL_SSE_BLOCK2 ", &
"COMPLEX_ELPA_KERNEL_AVX_BLOCK1 ", &
"COMPLEX_ELPA_KERNEL_AVX_BLOCK2 ", &
"COMPLEX_ELPA_KERNEL_AVX2_BLOCK1 ", &
"COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 ", &
"COMPLEX_ELPA_KERNEL_GPU "/)
integer(kind=ik), parameter :: &
......@@ -300,27 +325,58 @@ module ELPA2_utilities
#else
,0 &
#endif
#if WITH_REAL_AVX_BLOCK2_KERNEL
#if WITH_REAL_SSE_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX_BLOCK4_KERNEL
#if WITH_REAL_SSE_BLOCK4_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX_BLOCK6_KERNEL
#if WITH_REAL_SSE_BLOCK6_KERNEL
,1 &
#else
,0 &
#endif
#ifdef WITH_GPU_VERSION
#if WITH_REAL_AVX_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
/)
#if WITH_REAL_AVX_BLOCK4_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX_BLOCK6_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX2_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX2_BLOCK4_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX2_BLOCK6_KERNEL
,1 &
#else
,0 &
#endif
#ifdef WITH_GPU_VERSION
,1 &
#else
,0 &
#endif
/)
integer(kind=ik), parameter :: &
AVAILABLE_COMPLEX_ELPA_KERNELS(number_of_complex_kernels) = &
......@@ -350,22 +406,42 @@ module ELPA2_utilities
#else
,0 &
#endif
#if WITH_COMPLEX_AVX_BLOCK1_KERNEL
#if WITH_COMPLEX_SSE_BLOCK1_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_COMPLEX_AVX_BLOCK2_KERNEL
#if WITH_COMPLEX_SSE_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
#ifdef WITH_GPU_VERSION
#if WITH_COMPLEX_AVX_BLOCK1_KERNEL
,1 &
#else
,0 &
#endif
/)
#if WITH_COMPLEX_AVX_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_COMPLEX_AVX2_BLOCK1_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_COMPLEX_AVX2_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
#ifdef WITH_GPU_VERSION
,1 &
#else
,0 &
#endif
/)
!******
contains
......
......@@ -142,27 +142,56 @@ module compute_hh_trafo_complex
nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#endif
#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) then
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
ttt = mpi_wtime()
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_complex_sse_2hv(a(1,j+off+a_off-1,istripe,my_thread), &
w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_complex_sse_2hv(a(1,j+off+a_off-1,istripe), &
w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifdef WITH_OPENMP
if (j==1) call single_hh_trafo_complex_sse_1hv(a(1,1+off+a_off,istripe,my_thread), &
bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
if (j==1) call single_hh_trafo_complex_sse_1hv(a(1,1+off+a_off,istripe), &
bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK2) ) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
ttt = mpi_wtime()
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_complex_sse_avx_2hv_double(a(1,j+off+a_off-1,istripe,my_thread), &
call double_hh_trafo_complex_avx_avx2_2hv(a(1,j+off+a_off-1,istripe,my_thread), &
w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_complex_sse_avx_2hv_double(a(1,j+off+a_off-1,istripe), &
call double_hh_trafo_complex_avx_avx2_2hv(a(1,j+off+a_off-1,istripe), &
w, nbw, nl, stripe_width, nbw)
#endif
enddo
#ifdef WITH_OPENMP
if (j==1) call single_hh_trafo_complex_sse_avx_1hv_double(a(1,1+off+a_off,istripe,my_thread), &
if (j==1) call single_hh_trafo_complex_avx_avx2_1hv(a(1,1+off+a_off,istripe,my_thread), &
bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
if (j==1) call single_hh_trafo_complex_sse_avx_1hv_double(a(1,1+off+a_off,istripe), &
if (j==1) call single_hh_trafo_complex_avx_avx2_1hv(a(1,1+off+a_off,istripe), &
bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
......@@ -259,31 +288,51 @@ module compute_hh_trafo_complex
!#if defined(WITH_AVX_SANDYBRIDGE)
! call single_hh_trafo_complex_sse_avx_1hv_double(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
! call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
!#if defined(WITH_AMD_BULLDOZER)
! call single_hh_trafo_complex_sse_avx_1hv_double(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
! call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) then
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP
call single_hh_trafo_complex_sse_avx_1hv_double_double(a(1,j+off+a_off,istripe,my_thread), &
call single_hh_trafo_complex_sse_1hv(a(1,j+off+a_off,istripe,my_thread), &
bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_complex_sse_avx_1hv_double(a(1,j+off+a_off,istripe), &
call single_hh_trafo_complex_sse_1hv(a(1,j+off+a_off,istripe), &
bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif