Commit 24867b0e authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'master' into ELPA_GPU

parents 3896c305 ebc097eb
...@@ -95,24 +95,44 @@ endif ...@@ -95,24 +95,44 @@ endif
endif endif
endif endif
if WITH_REAL_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c
endif
if WITH_REAL_AVX_BLOCK2_KERNEL if WITH_REAL_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c
endif
if WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
endif endif
if WITH_REAL_AVX_BLOCK4_KERNEL if WITH_REAL_AVX_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c
endif
if WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
endif endif
if WITH_REAL_AVX_BLOCK6_KERNEL if WITH_REAL_AVX_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
endif
if WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp
endif endif
if WITH_COMPLEX_AVX_BLOCK1_KERNEL if WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
endif
if WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp
endif endif
if WITH_COMPLEX_AVX_BLOCK2_KERNEL if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp
endif endif
.cu.lo: .cu.lo:
......
...@@ -151,6 +151,10 @@ install_real_generic_simple=yes ...@@ -151,6 +151,10 @@ install_real_generic_simple=yes
install_complex_generic=yes install_complex_generic=yes
install_complex_generic_simple=yes install_complex_generic_simple=yes
#want_avx=yes
#want_avx2=yes
#want_sse=yes
AC_LANG([C]) AC_LANG([C])
dnl build with ftimings support dnl build with ftimings support
...@@ -204,12 +208,26 @@ $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s -o ...@@ -204,12 +208,26 @@ $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s -o
if test "$?" == 0; then if test "$?" == 0; then
can_compile_sse=yes can_compile_sse=yes
install_real_sse=yes install_real_sse=yes
install_real_sse_block2=yes
install_real_sse_block4=yes
install_real_sse_block6=yes
install_complex_sse=yes install_complex_sse=yes
install_complex_sse_block1=yes
install_complex_sse_block2=yes
else else
can_compile_sse=no can_compile_sse=no
install_real_sse=no install_real_sse=no
install_real_sse_block2=no
install_real_sse_block4=no
install_real_sse_block6=no
install_complex_sse=no install_complex_sse=no
install_complex_sse_block1=no
install_complex_sse_block2=no
fi fi
rm -f ./test.o rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}]) AC_MSG_RESULT([${can_compile_sse}])
...@@ -249,6 +267,7 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([ ...@@ -249,6 +267,7 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
[can_compile_avx=no] [can_compile_avx=no]
) )
AC_MSG_RESULT([${can_compile_avx}]) AC_MSG_RESULT([${can_compile_avx}])
if test "${can_compile_avx}" = "yes" ; then if test "${can_compile_avx}" = "yes" ; then
AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++]) AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++])
AC_LANG_PUSH([C++]) AC_LANG_PUSH([C++])
...@@ -306,7 +325,6 @@ if test "${can_compile_avx2}" = "yes" ; then ...@@ -306,7 +325,6 @@ if test "${can_compile_avx2}" = "yes" ; then
fi fi
fi fi
if test "${can_compile_avx}" = "yes" ; then if test "${can_compile_avx}" = "yes" ; then
install_real_avx_block2=yes install_real_avx_block2=yes
install_real_avx_block4=yes install_real_avx_block4=yes
...@@ -314,8 +332,6 @@ if test "${can_compile_avx}" = "yes" ; then ...@@ -314,8 +332,6 @@ if test "${can_compile_avx}" = "yes" ; then
install_complex_avx_block1=yes install_complex_avx_block1=yes
install_complex_avx_block2=yes install_complex_avx_block2=yes
want_avx=yes
else else
install_real_avx_block2=no install_real_avx_block2=no
install_real_avx_block4=no install_real_avx_block4=no
...@@ -323,8 +339,34 @@ else ...@@ -323,8 +339,34 @@ else
install_complex_avx_block1=no install_complex_avx_block1=no
install_complex_avx_block2=no install_complex_avx_block2=no
fi
if test "${can_compile_avx2}" = "yes" ; then
install_real_avx2_block2=yes
install_real_avx2_block4=yes
install_real_avx2_block6=yes
want_avx=yes install_complex_avx2_block1=yes
install_complex_avx2_block2=yes
else
install_real_avx2_block2=no
install_real_avx2_block4=no
install_real_avx2_block6=no
install_complex_avx2_block1=no
install_complex_avx2_block2=no
fi
AM_CONDITIONAL([HAVE_SSE],[test x"$can_compile_sse" = x"yes"])
if test x"${can_compile_sse}" = x"yes" ; then
AC_DEFINE([HAVE_SSE],[1],[SSE is supported on this CPU])
fi
AM_CONDITIONAL([HAVE_AVX],[test x"$can_compile_avx" = x"yes"])
if test x"${can_compile_avx}" = x"yes" ; then
AC_DEFINE([HAVE_AVX],[1],[AVX is supported on this CPU])
fi
AM_CONDITIONAL([HAVE_AVX2],[test x"$can_compile_avx2" = x"yes"])
if test x"${can_compile_avx2}" = x"yes" ; then
AC_DEFINE([HAVE_AVX2],[1],[AVX2 is supported on this CPU])
fi fi
dnl set the AVX optimization flags if this option is specified dnl set the AVX optimization flags if this option is specified
...@@ -492,7 +534,6 @@ AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environm ...@@ -492,7 +534,6 @@ AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environm
AC_COMPILE_IFELSE([AC_LANG_SOURCE([ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
program test_get_environment program test_get_environment
character(len=256) :: homedir character(len=256) :: homedir
call get_environment_variable("HOME",homedir) call get_environment_variable("HOME",homedir)
end program end program
...@@ -638,6 +679,15 @@ DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_ ...@@ -638,6 +679,15 @@ DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_
dnl bgq kernel dnl bgq kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq])
dnl real-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block2-kernel-only],[real-sse-block2-kernel],[install_real_sse_block2])
dnl real-sse-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block4-kernel]-only,[real-sse-block4-kernel],[install_real_sse_block4])
dnl real-sse-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block6-kernel-only],[real-sse-block6-kernel],[install_real_sse_block6])
dnl real-avx-block2 kernel dnl real-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2])
...@@ -664,6 +714,12 @@ DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[in ...@@ -664,6 +714,12 @@ DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[in
dnl complex-bqq kernel dnl complex-bqq kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq])
dnl complex-sse-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block1-kernel-only],[complex-sse-block1-kernel],[install_complex_sse_block1])
dnl complex-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block2-kernel-only],[complex-sse-block2-kernel],[install_complex_sse_block2])
dnl complex-avx-block1 kernel dnl complex-avx-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1])
...@@ -711,6 +767,21 @@ if test x"${install_complex_sse}" = x"yes" ; then ...@@ -711,6 +767,21 @@ if test x"${install_complex_sse}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel]) AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel])
fi fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK2_KERNEL],[test x"$install_real_sse_block2" = x"yes"])
if test x"${install_real_sse_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK2_KERNEL],[1],[can use real_sse_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK4_KERNEL],[test x"$install_real_sse_block4" = x"yes"])
if test x"${install_real_sse_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK4_KERNEL],[1],[can use real_sse_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK6_KERNEL],[test x"$install_real_sse_block6" = x"yes"])
if test x"${install_real_sse_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK6_KERNEL],[1],[can use real_sse_block6 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"]) AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"])
if test x"${install_real_avx_block2}" = x"yes" ; then if test x"${install_real_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel]) AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel])
...@@ -726,6 +797,31 @@ if test x"${install_real_avx_block6}" = x"yes" ; then ...@@ -726,6 +797,31 @@ if test x"${install_real_avx_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel]) AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel])
fi fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK2_KERNEL],[test x"$install_real_avx2_block2" = x"yes"])
if test x"${install_real_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK2_KERNEL],[1],[can use real_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK4_KERNEL],[test x"$install_real_avx2_block4" = x"yes"])
if test x"${install_real_avx2_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK4_KERNEL],[1],[can use real_avx2_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK6_KERNEL],[test x"$install_real_avx2_block6" = x"yes"])
if test x"${install_real_avx2_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK6_KERNEL],[1],[can use real_avx2_block6 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"])
if test x"${install_complex_sse_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[test x"$install_complex_sse_block2" = x"yes"])
if test x"${install_complex_sse_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[1],[can use complex_sse_block2 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"]) AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"])
if test x"${install_complex_avx_block1}" = x"yes" ; then if test x"${install_complex_avx_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel]) AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel])
...@@ -736,6 +832,16 @@ if test x"${install_complex_avx_block2}" = x"yes" ; then ...@@ -736,6 +832,16 @@ if test x"${install_complex_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel]) AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel])
fi fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[test x"$install_complex_avx2_block1" = x"yes"])
if test x"${install_complex_avx2_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[1],[can use complex_avx2_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[test x"$install_complex_avx2_block2" = x"yes"])
if test x"${install_complex_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[1],[can use complex_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"]) AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"])
if test x"${install_real_bgp}" = x"yes" ; then if test x"${install_real_bgp}" = x"yes" ; then
AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel]) AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel])
...@@ -840,14 +946,14 @@ mkdir -p test/shared_sources ...@@ -840,14 +946,14 @@ mkdir -p test/shared_sources
grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1 grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1
if test "${can_compile_avx}" = "no" ; then if test "${can_compile_avx}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then # if test x"${want_avx}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX instructions]) AC_MSG_WARN([Could not compile AVX instructions])
fi # fi
fi fi
if test "${can_compile_avx2}" = "no" ; then if test "${can_compile_avx2}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then # if test x"${want_avx2}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX2 instructions]) AC_MSG_WARN([Could not compile AVX2 instructions])
fi # fi
fi fi
if test "${can_compile_sse}" = "no" ; then if test "${can_compile_sse}" = "no" ; then
......
...@@ -3,21 +3,30 @@ ...@@ -3,21 +3,30 @@
#define ELPA2_REAL_KERNEL_BGP 3 #define ELPA2_REAL_KERNEL_BGP 3
#define ELPA2_REAL_KERNEL_BGQ 4 #define ELPA2_REAL_KERNEL_BGQ 4
#define ELPA2_REAL_KERNEL_SSE 5 #define ELPA2_REAL_KERNEL_SSE 5
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6 #define ELPA2_REAL_KERNEL_SSE_BLOCK2 6
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7 #define ELPA2_REAL_KERNEL_SSE_BLOCK4 7
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8 #define ELPA2_REAL_KERNEL_SSE_BLOCK6 8
#define ELPA2_REAL_KERNEL_GPU 9 #define ELPA2_REAL_KERNEL_AVX_BLOCK2 9
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 10
#define ELPA2_NUMBER_OF_REAL_KERNELS 9 #define ELPA2_REAL_KERNEL_AVX_BLOCK6 11
#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12
#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13
#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14
#define ELPA2_REAL_KERNEL_GPU 15
#define ELPA2_NUMBER_OF_REAL_KERNELS 15
#define ELPA2_COMPLEX_KERNEL_GENERIC 1 #define ELPA2_COMPLEX_KERNEL_GENERIC 1
#define ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE 2 #define ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE 2
#define ELPA2_COMPLEX_KERNEL_BGP 3 #define ELPA2_COMPLEX_KERNEL_BGP 3
#define ELPA2_COMPLEX_KERNEL_BGQ 4 #define ELPA2_COMPLEX_KERNEL_BGQ 4
#define ELPA2_COMPLEX_KERNEL_SSE 5 #define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6 #define ELPA2_COMPLEX_KERNEL_SSE_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7 #define ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_GPU 8 #define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 8
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11
#define ELPA2_COMPLEX_KERNEL_GPU 12
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 8 #define ELPA2_NUMBER_OF_COMPLEX_KERNELS 12
...@@ -59,15 +59,14 @@ ...@@ -59,15 +59,14 @@
// Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// -------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <complex> #include <complex>
#include <x86intrin.h> #include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
#ifdef __USE_AVX128__ #ifdef HAVE_AVX2
#undef __AVX__
#endif
#ifdef __FMA4__ #ifdef __FMA4__
#define __ELPA_USE_FMA__ #define __ELPA_USE_FMA__
...@@ -81,20 +80,15 @@ ...@@ -81,20 +80,15 @@
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) #define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif #endif
#endif
extern "C" { extern "C" {
//Forward declaration //Forward declaration
#ifdef __AVX__
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
#else
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
#endif
#if 0 #if 0
static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{ {
...@@ -145,7 +139,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>* ...@@ -145,7 +139,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>*
} }
#endif // if 0 #endif // if 0
void single_hh_trafo_complex_sse_avx_1hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq) void single_hh_trafo_complex_avx_avx2_1hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
...@@ -153,7 +147,6 @@ void single_hh_trafo_complex_sse_avx_1hv_(std::complex<double>* q, std::complex< ...@@ -153,7 +147,6 @@ void single_hh_trafo_complex_sse_avx_1hv_(std::complex<double>* q, std::complex<
int ldq = *pldq; int ldq = *pldq;
//int ldh = *pldh; //int ldh = *pldh;
#ifdef __AVX__
for (i = 0; i < nq-8; i+=12) for (i = 0; i < nq-8; i+=12)
{ {
hh_trafo_complex_kernel_12_AVX_1hv(&q[i], hh, nb, ldq); hh_trafo_complex_kernel_12_AVX_1hv(&q[i], hh, nb, ldq);
...@@ -166,23 +159,8 @@ void single_hh_trafo_complex_sse_avx_1hv_(std::complex<double>* q, std::complex< ...@@ -166,23 +159,8 @@ void single_hh_trafo_complex_sse_avx_1hv_(std::complex<double>* q, std::complex<
{ {
hh_trafo_complex_kernel_4_AVX_1hv(&q[i], hh, nb, ldq); hh_trafo_complex_kernel_4_AVX_1hv(&q[i], hh, nb, ldq);
} }
#else
for (i = 0; i < nq-4; i+=6)
{
hh_trafo_complex_kernel_6_SSE_1hv(&q[i], hh, nb, ldq);
}
if (nq-i > 2)
{
hh_trafo_complex_kernel_4_SSE_1hv(&q[i], hh, nb, ldq);
}
else if (nq-i > 0)
{
hh_trafo_complex_kernel_2_SSE_1hv(&q[i], hh, nb, ldq);
}
#endif
} }
#ifdef __AVX__
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
...@@ -207,7 +185,7 @@ void single_hh_trafo_complex_sse_avx_1hv_(std::complex<double>* q, std::complex< ...@@ -207,7 +185,7 @@ void single_hh_trafo_complex_sse_avx_1hv_(std::complex<double>* q, std::complex<
{ {
h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__ #ifndef __ELPA_USE_FMA__
// conjugate // conjugate
h1_imag = _mm256_xor_pd(h1_imag, sign); h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif #endif
...@@ -345,7 +323,7 @@ void single_hh_trafo_complex_sse_avx_1hv_(std::complex<double>* q, std::complex< ...@@ -345,7 +323,7 @@ void single_hh_trafo_complex_sse_avx_1hv_(std::complex<double>* q, std::complex<
q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif #endif
tmp3 = _mm256_mul_pd(h1_imag, x3); tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__ #ifdef __ELPA_USE_FMA__
q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else #else
q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
...@@ -400,7 +378,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double> ...@@ -400,7 +378,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>
{ {
h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__ #ifndef __ELPA_USE_FMA__
// conjugate // conjugate
h1_imag = _mm256_xor_pd(h1_imag, sign); h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif #endif
...@@ -500,13 +478,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double> ...@@ -500,13 +478,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>
tmp2 = _mm256_mul_pd(h1_imag, x2); tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__ #ifdef __ELPA_USE_FMA__
q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else #else
q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif #endif
tmp3 = _mm256_mul_pd(h1_imag, x3); tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__ #ifdef __ELPA_USE_FMA__
q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else #else
q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif #endif
tmp4 = _mm256_mul_pd(h1_imag, x4); tmp4 = _mm256_mul_pd(h1_imag, x4);
...@@ -617,441 +595,4 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double> ...@@ -617,441 +595,4 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>
_mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
} }
} }
#else
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
__m128d x1, x2, x3, x4, x5, x6;
__m128d q1, q2, q3, q4, q5, q6;
__m128d h1_real, h1_imag;
__m128d tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
int i=0;
__m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
x1 = _mm_load_pd(&q_dbl[0]);
x2 = _mm_load_pd(&q_dbl[2]);
x3 = _mm_load_pd(&q_dbl[4]);
x4 = _mm_load_pd(&q_dbl[6]);
x5 = _mm_load_pd(&q_dbl[8]);
x6 = _mm_load_pd(&q_dbl[10]);
for (i = 1; i < nb; i++)
{
h1_real = _mm_loaddup_pd(&hh_dbl[i*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _mm_xor_pd(h1_imag, sign);
#endif
q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]);
q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]);
q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);
q5 = _mm_load_pd(&q_dbl[(2*i*ldq)+8]);
q6 = _mm_load_pd(&q_dbl[(2*i*ldq)+10]);