Commit 09d13e2b authored by Andreas Marek
Browse files

Merge branch 'master' into ELPA_GPU

parents 62fe6edc 0d256c1b
...@@ -12,6 +12,7 @@ autom4te.cache ...@@ -12,6 +12,7 @@ autom4te.cache
compile compile
config.guess config.guess
config.h.in config.h.in
config.h.in~
config.sub config.sub
configure configure
depcomp depcomp
...@@ -19,3 +20,8 @@ install-sh ...@@ -19,3 +20,8 @@ install-sh
ltmain.sh ltmain.sh
missing missing
test-driver test-driver
m4/libtool.m4
m4/ltoptions.m4
m4/ltsugar.m4
m4/ltversion.m4
m4/lt~obsolete.m4
jobs: jobs:
script: ./autogen.sh && ./configure && make && make check TEST_FLAGS='1500 50 16' script:
- export LANG=C
- module load impi intel gcc mkl autotools
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" SCALAPACK_FCFLAGS="-L/afs/@cell/common/soft/intel/ics2015/15.0/mkl/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64"
- make -j 8
- make check TEST_FLAGS='1500 50 16'
...@@ -82,18 +82,18 @@ if WITH_REAL_BGQ_KERNEL ...@@ -82,18 +82,18 @@ if WITH_REAL_BGQ_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90
endif endif
if WITH_REAL_SSE_KERNEL if WITH_REAL_SSE_ASSEMBLY_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s
if WANT_SINGLE_PRECISION_REAL if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s
endif endif
else else
if WITH_COMPLEX_SSE_KERNEL if WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s
if WANT_SINGLE_PRECISION_COMPLEX if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s
endif endif
endif endif
endif endif
......
...@@ -202,55 +202,77 @@ if test x"${with_ftimings}" = x"yes"; then ...@@ -202,55 +202,77 @@ if test x"${with_ftimings}" = x"yes"; then
fi fi
AM_CONDITIONAL([HAVE_DETAILED_TIMINGS],[test x"$with_ftimings" = x"yes"]) AM_CONDITIONAL([HAVE_DETAILED_TIMINGS],[test x"$with_ftimings" = x"yes"])
AC_MSG_CHECKING(whether double-precision SSE assembler kernel can be compiled) AC_MSG_CHECKING(whether double-precision SSE assembly kernel can be compiled)
$CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s -o test.o 2>/dev/null $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s -o test.o 2>/dev/null
if test "$?" == 0; then if test "$?" == 0; then
can_compile_sse=yes can_compile_sse_assembly=yes
install_real_sse=yes install_real_sse_assembly=yes
install_complex_sse_assembly=yes
else
can_compile_sse_assembly=no
install_real_sse_assembly=no
install_complex_sse_assembly=no
fi
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse_assembly}])
dnl Optional second probe: when single precision is requested, the
dnl single-precision SSE assembly kernel has to assemble as well.
dnl
dnl The outcome is kept in its own variable so that a successful probe here
dnl can no longer overwrite a negative result already stored in
dnl can_compile_sse_assembly and the install_*_sse_assembly variables by an
dnl earlier probe. A failure still switches every SSE assembly kernel off,
dnl which is what the warning below announces.
if test x"${want_single_precision}" = x"yes" ; then
  AC_MSG_CHECKING([whether single-precision SSE assembly kernel can be compiled])
dnl Branch on the exit status of the compiler directly instead of comparing
dnl the status with the non-portable == operator of test.
  if $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s -o test.o 2>/dev/null; then
    can_compile_sse_assembly_single=yes
  else
    can_compile_sse_assembly_single=no
  fi
dnl Remove the object file produced by the probe, if any.
  rm -f ./test.o
  AC_MSG_RESULT([${can_compile_sse_assembly_single}])
  if test x"${can_compile_sse_assembly_single}" = x"no" ; then
    can_compile_sse_assembly=no
    install_real_sse_assembly=no
    install_complex_sse_assembly=no
    AC_MSG_WARN([Cannot compile single-precision SSE assembly kernel: disabling SSE assembly kernels alltogether])
  fi
fi
dnl Check whether one can compile with SSE gcc intrinsics.
dnl
dnl The probe program includes x86intrin.h and calls _mm_loaddup_pd once.
dnl AC_COMPILE_IFELSE only compiles the program and never runs it, so the
dnl uninitialised pointer q is not dereferenced at configure time.
dnl The outcome is stored in can_compile_sse_intrinsics as yes or no and
dnl reported through AC_MSG_RESULT.
dnl NOTE(review): the message says the probe is compiled as C; the active
dnl AC_LANG setting is not visible in this part of the file - confirm.
AC_MSG_CHECKING(whether we can compile SSE with gcc intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
double* q;
__m128d h1 = _mm_loaddup_pd(q);
return 0;
}
])],
[can_compile_sse_intrinsics=yes],
[can_compile_sse_intrinsics=no]
)
AC_MSG_RESULT([${can_compile_sse_intrinsics}])
if test "${can_compile_sse_intrinsics}" = "yes"; then
install_real_sse_intrinsics=yes
install_real_sse_block2=yes install_real_sse_block2=yes
install_real_sse_block4=yes install_real_sse_block4=yes
install_real_sse_block6=yes install_real_sse_block6=yes
install_complex_sse=yes install_complex_sse_intrinsics=yes
install_complex_sse_block1=yes install_complex_sse_block1=yes
install_complex_sse_block2=yes install_complex_sse_block2=yes
else else
can_compile_sse=no install_real_sse_intrinsics=no
install_real_sse=no
install_real_sse_block2=no install_real_sse_block2=no
install_real_sse_block4=no install_real_sse_block4=no
install_real_sse_block6=no install_real_sse_block6=no
install_complex_sse=no install_complex_sse_intrinsics=no
install_complex_sse_block1=no install_complex_sse_block1=no
install_complex_sse_block2=no install_complex_sse_block2=no
fi fi
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}])
if test x"${want_single_precision}" = x"yes" ; then
AC_MSG_CHECKING(whether single-precision SSE assembler kernel can be compiled)
$CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s -o test.o 2>/dev/null
if test "$?" == 0; then
can_compile_sse=yes
install_real_sse=yes
install_complex_sse=yes
else
can_compile_sse=no
install_real_sse=no
install_complex_sse=no
fi
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}])
if test x"${can_compile_sse}" = x"no" ; then
AC_MSG_WARN([Cannot compile single-precision SSE kernel: disabling SSE kernels alltogether])
fi
fi
dnl check whether one can compile with avx - gcc intrinsics dnl check whether one can compile with avx - gcc intrinsics
dnl first pass: try with specified CFLAGS and CXXFLAGS dnl first pass: try with specified CFLAGS and CXXFLAGS
...@@ -356,10 +378,16 @@ else ...@@ -356,10 +378,16 @@ else
install_complex_avx2_block1=no install_complex_avx2_block1=no
install_complex_avx2_block2=no install_complex_avx2_block2=no
fi fi
AM_CONDITIONAL([HAVE_SSE],[test x"$can_compile_sse" = x"yes"])
if test x"${can_compile_sse}" = x"yes" ; then AM_CONDITIONAL([HAVE_SSE_ASSEMBLY],[test x"$can_compile_sse_assembly" = x"yes"])
AC_DEFINE([HAVE_SSE],[1],[SSE is supported on this CPU]) if test x"${can_compile_sse_assembly}" = x"yes" ; then
AC_DEFINE([HAVE_SSE_ASSEMBLY],[1],[assembly SSE is supported on this CPU])
fi
AM_CONDITIONAL([HAVE_SSE_INTRINSICS],[test x"$can_compile_sse_intrinsics" = x"yes"])
if test x"${can_compile_sse_intrinsics}" = x"yes" ; then
AC_DEFINE([HAVE_SSE_INTRINSICS],[1],[gcc intrinsics SSE is supported on this CPU])
fi fi
AM_CONDITIONAL([HAVE_AVX],[test x"$can_compile_avx" = x"yes"]) AM_CONDITIONAL([HAVE_AVX],[test x"$can_compile_avx" = x"yes"])
if test x"${can_compile_avx}" = x"yes" ; then if test x"${can_compile_avx}" = x"yes" ; then
AC_DEFINE([HAVE_AVX],[1],[AVX is supported on this CPU]) AC_DEFINE([HAVE_AVX],[1],[AVX is supported on this CPU])
...@@ -671,7 +699,7 @@ dnl generic-simple kernel ...@@ -671,7 +699,7 @@ dnl generic-simple kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-simple-kernel-only],[generic-simple-kernel],[install_real_generic_simple]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-simple-kernel-only],[generic-simple-kernel],[install_real_generic_simple])
dnl sse kernel dnl sse kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-kernel-only],[sse-kernel],[install_real_sse]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-assembly-kernel-only],[sse-assembly-kernel],[install_real_sse_assembly])
dnl bgp kernel dnl bgp kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_real_bgp]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_real_bgp])
...@@ -706,7 +734,7 @@ dnl generic-simple kernel ...@@ -706,7 +734,7 @@ dnl generic-simple kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-simple-kernel-only],[generic-simple-kernel],[install_complex_generic_simple]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-simple-kernel-only],[generic-simple-kernel],[install_complex_generic_simple])
dnl sse kernel dnl sse kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-kernel-only],[sse-kernel],[install_complex_sse]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-assembly-kernel-only],[sse-assembly-kernel],[install_complex_sse_assembly])
dnl complex-bgp kernel dnl complex-bgp kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[install_complex_bgp]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[install_complex_bgp])
...@@ -757,14 +785,14 @@ if test x"${install_complex_generic_simple}" = x"yes" ; then ...@@ -757,14 +785,14 @@ if test x"${install_complex_generic_simple}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_GENERIC_SIMPLE_KERNEL],[1],[can use complex generic-simple kernel]) AC_DEFINE([WITH_COMPLEX_GENERIC_SIMPLE_KERNEL],[1],[can use complex generic-simple kernel])
fi fi
AM_CONDITIONAL([WITH_REAL_SSE_KERNEL],[test x"$install_real_sse" = x"yes"]) AM_CONDITIONAL([WITH_REAL_SSE_ASSEMBLY_KERNEL],[test x"$install_real_sse_assembly" = x"yes"])
if test x"${install_real_sse}" = x"yes" ; then if test x"${install_real_sse_assembly}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_KERNEL],[1],[can use real SSE kernel]) AC_DEFINE([WITH_REAL_SSE_ASSEMBLY_KERNEL],[1],[can use real SSE assembly kernel])
fi fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_KERNEL],[test x"$install_complex_sse" = x"yes"]) AM_CONDITIONAL([WITH_COMPLEX_SSE_ASSEMBLY_KERNEL],[test x"$install_complex_sse_assembly" = x"yes"])
if test x"${install_complex_sse}" = x"yes" ; then if test x"${install_complex_sse_assembly}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel]) AC_DEFINE([WITH_COMPLEX_SSE_ASSEMBLY_KERNEL],[1],[can use complex SSE assembly kernel])
fi fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK2_KERNEL],[test x"$install_real_sse_block2" = x"yes"]) AM_CONDITIONAL([WITH_REAL_SSE_BLOCK2_KERNEL],[test x"$install_real_sse_block2" = x"yes"])
......
...@@ -67,7 +67,7 @@ ...@@ -67,7 +67,7 @@
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
#ifdef HAVE_SSE #ifdef HAVE_SSE_INTRINSICS
#undef __AVX__ #undef __AVX__
#endif #endif
...@@ -78,7 +78,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_double(double comple ...@@ -78,7 +78,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_double(double comple
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_double(double complex* q, double complex* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
/* /*
!f>#ifdef HAVE_SSE !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
!f> subroutine single_hh_trafo_complex_sse_1hv_double(q, hh, pnb, pnq, pldq) & !f> subroutine single_hh_trafo_complex_sse_1hv_double(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_sse_1hv_double") !f> bind(C, name="single_hh_trafo_complex_sse_1hv_double")
......
...@@ -67,7 +67,7 @@ ...@@ -67,7 +67,7 @@
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
#ifdef HAVE_SSE #ifdef HAVE_SSE_INTRINSICS
#undef __AVX__ #undef __AVX__
#endif #endif
...@@ -78,7 +78,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_single(complex* q, c ...@@ -78,7 +78,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_single(complex* q, c
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_single(complex* q, complex* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_single(complex* q, complex* hh, int nb, int ldq);
/* /*
!f>#ifdef HAVE_SSE !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
!f> subroutine single_hh_trafo_complex_sse_1hv_single(q, hh, pnb, pnq, pldq) & !f> subroutine single_hh_trafo_complex_sse_1hv_single(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_sse_1hv_single") !f> bind(C, name="single_hh_trafo_complex_sse_1hv_single")
......
...@@ -66,7 +66,7 @@ ...@@ -66,7 +66,7 @@
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
#ifdef HAVE_SSE #ifdef HAVE_SSE_INTRINSICS
#undef __AVX__ #undef __AVX__
#endif #endif
...@@ -78,7 +78,7 @@ static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(double comple ...@@ -78,7 +78,7 @@ static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(double comple
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
/* /*
!f>#ifdef HAVE_SSE !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
!f> subroutine double_hh_trafo_complex_sse_2hv_double(q, hh, pnb, pnq, pldq, pldh) & !f> subroutine double_hh_trafo_complex_sse_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_sse_2hv_double") !f> bind(C, name="double_hh_trafo_complex_sse_2hv_double")
......
...@@ -66,7 +66,7 @@ ...@@ -66,7 +66,7 @@
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
#ifdef HAVE_SSE #ifdef HAVE_SSE_INTRINSICS
#undef __AVX__ #undef __AVX__
#endif #endif
...@@ -74,7 +74,7 @@ ...@@ -74,7 +74,7 @@
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(complex* q, complex* hh, int nb, int ldq, int ldh, complex s, complex s1); static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(complex* q, complex* hh, int nb, int ldq, int ldh, complex s, complex s1);
/* /*
!f>#ifdef HAVE_SSE !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
!f> subroutine double_hh_trafo_complex_sse_2hv_single(q, hh, pnb, pnq, pldq, pldh) & !f> subroutine double_hh_trafo_complex_sse_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_sse_2hv_single") !f> bind(C, name="double_hh_trafo_complex_sse_2hv_single")
......
...@@ -67,7 +67,7 @@ ...@@ -67,7 +67,7 @@
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_SSE #ifdef HAVE_SSE_INTRINSICS
#undef __AVX__ #undef __AVX__
#endif #endif
...@@ -79,7 +79,7 @@ __forceinline void hh_trafo_kernel_12_SSE_2hv_double(double* q, double* hh, int ...@@ -79,7 +79,7 @@ __forceinline void hh_trafo_kernel_12_SSE_2hv_double(double* q, double* hh, int
void double_hh_trafo_real_sse_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void double_hh_trafo_real_sse_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
/* /*
!f>#ifdef HAVE_SSE !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
!f> subroutine double_hh_trafo_real_sse_2hv_double(q, hh, pnb, pnq, pldq, pldh) & !f> subroutine double_hh_trafo_real_sse_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_real_sse_2hv_double") !f> bind(C, name="double_hh_trafo_real_sse_2hv_double")
......
...@@ -67,7 +67,7 @@ ...@@ -67,7 +67,7 @@
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_SSE #ifdef HAVE_SSE_INTRINSICS
#undef __AVX__ #undef __AVX__
#endif #endif
...@@ -79,7 +79,7 @@ __forceinline void hh_trafo_kernel_12_SSE_2hv_single(float* q, float* hh, int nb ...@@ -79,7 +79,7 @@ __forceinline void hh_trafo_kernel_12_SSE_2hv_single(float* q, float* hh, int nb
void double_hh_trafo_real_sse_2hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh); void double_hh_trafo_real_sse_2hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
/* /*
!f>#ifdef HAVE_SSE !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
!f> subroutine double_hh_trafo_real_sse_2hv_single(q, hh, pnb, pnq, pldq, pldh) & !f> subroutine double_hh_trafo_real_sse_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_real_sse_2hv_single") !f> bind(C, name="double_hh_trafo_real_sse_2hv_single")
......
...@@ -66,7 +66,7 @@ ...@@ -66,7 +66,7 @@
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_SSE #ifdef HAVE_SSE_INTRINSICS
#undef __AVX__ #undef __AVX__
#endif #endif
...@@ -78,7 +78,7 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv_double(double* q, double* hh, int n ...@@ -78,7 +78,7 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv_double(double* q, double* hh, int n
void quad_hh_trafo_real_sse_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void quad_hh_trafo_real_sse_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
/* /*
!f>#ifdef HAVE_SSE !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
!f> subroutine quad_hh_trafo_real_sse_4hv_double(q, hh, pnb, pnq, pldq, pldh) & !f> subroutine quad_hh_trafo_real_sse_4hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="quad_hh_trafo_real_sse_4hv_double") !f> bind(C, name="quad_hh_trafo_real_sse_4hv_double")
......
...@@ -66,7 +66,7 @@ ...@@ -66,7 +66,7 @@
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_SSE #ifdef HAVE_SSE_INTRINSICS
#undef __AVX__ #undef __AVX__
#endif #endif
__forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4); __forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
...@@ -76,7 +76,7 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb ...@@ -76,7 +76,7 @@ __forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb
void quad_hh_trafo_real_sse_4hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh); void quad_hh_trafo_real_sse_4hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
/* /*
!f>#ifdef HAVE_SSE !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
!f> subroutine quad_hh_trafo_real_sse_4hv_single(q, hh, pnb, pnq, pldq, pldh) & !f> subroutine quad_hh_trafo_real_sse_4hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="quad_hh_trafo_real_sse_4hv_single") !f> bind(C, name="quad_hh_trafo_real_sse_4hv_single")
......
...@@ -66,7 +66,7 @@ ...@@ -66,7 +66,7 @@
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_SSE #ifdef HAVE_SSE_INTRINSICS
#undef __AVX__ #undef __AVX__
#endif #endif
...@@ -76,7 +76,7 @@ static void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ...@@ -76,7 +76,7 @@ static void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int
void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
/* /*
!f>#ifdef HAVE_SSE !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
!f> subroutine hexa_hh_trafo_real_sse_6hv_double(q, hh, pnb, pnq, pldq, pldh) & !f> subroutine hexa_hh_trafo_real_sse_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_sse_6hv_double") !f> bind(C, name="hexa_hh_trafo_real_sse_6hv_double")
......
...@@ -66,7 +66,7 @@ ...@@ -66,7 +66,7 @@
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_SSE #ifdef HAVE_SSE_INTRINSICS
#undef __AVX__ #undef __AVX__
#endif #endif
...@@ -80,7 +80,7 @@ static void hh_trafo_kernel_8_SSE_6hv_single(float* q, float* hh, int nb, int ld ...@@ -80,7 +80,7 @@ static void hh_trafo_kernel_8_SSE_6hv_single(float* q, float* hh, int nb, int ld
void hexa_hh_trafo_real_sse_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh); void hexa_hh_trafo_real_sse_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
/* /*
!f>#ifdef HAVE_SSE !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
!f> subroutine hexa_hh_trafo_real_sse_6hv_single(q, hh, pnb, pnq, pldq, pldh) & !f> subroutine hexa_hh_trafo_real_sse_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_sse_6hv_single") !f> bind(C, name="hexa_hh_trafo_real_sse_6hv_single")
......
...@@ -132,7 +132,7 @@ module ELPA2_utilities ...@@ -132,7 +132,7 @@ module ELPA2_utilities
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL #ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE
#endif #endif
#ifdef WITH_REAL_SSE_KERNEL #ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif #endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL #ifdef WITH_REAL_AVX_BLOCK2_KERNEL
...@@ -168,7 +168,7 @@ module ELPA2_utilities ...@@ -168,7 +168,7 @@ module ELPA2_utilities
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL #ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE
#endif #endif
#ifdef WITH_REAL_SSE_KERNEL #ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif #endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL #ifdef WITH_REAL_AVX_BLOCK2_KERNEL
...@@ -238,7 +238,7 @@ module ELPA2_utilities ...@@ -238,7 +238,7 @@ module ELPA2_utilities
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL #ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
#endif #endif
#ifdef WITH_COMPLEX_SSE_KERNEL #ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif #endif
#ifdef WITH_COMPLEX_AVX1_BLOCK1_KERNEL #ifdef WITH_COMPLEX_AVX1_BLOCK1_KERNEL
...@@ -267,7 +267,7 @@ module ELPA2_utilities ...@@ -267,7 +267,7 @@ module ELPA2_utilities
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL #ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
#endif #endif
#ifdef WITH_COMPLEX_SSE_KERNEL #ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif #endif
#ifdef WITH_COMPLEX_AVX1_BLOCK1_KERNEL #ifdef WITH_COMPLEX_AVX1_BLOCK1_KERNEL
...@@ -321,7 +321,7 @@ module ELPA2_utilities ...@@ -321,7 +321,7 @@ module ELPA2_utilities
#else #else
,0 & ,0 &
#endif #endif
#if WITH_REAL_SSE_KERNEL #if WITH_REAL_SSE_ASSEMBLY_KERNEL
,1 & ,1 &
#else #else
,0 & ,0 &
...@@ -402,7 +402,7 @@ module ELPA2_utilities ...@@ -402,7 +402,7 @@ module ELPA2_utilities
#else #else
,0 &