Commit 896388e9 authored by Andreas Marek's avatar Andreas Marek

Additional configure check for gcc SSE intrinsics

It turned out that if a CPU supports SSE the already existing
test for SSE assembly instructions always passes.
However, the compilation of gcc SSE intrinsic instructions might
nevertheless fail if gcc is not called with one of the options
"-msse3", "-msse4", "-msse4.1", "-msse4.2", "-mavx", or "-mavx2"!

Obviously gcc still does not consider SSE as a standard on X86_64
Intel CPUs.

An additional configure test has been introduced, which tests for
gcc SSE intrinsic instructions. If this test fails, the corresponding
kernels are switched off.
parent 7423daeb
......@@ -73,10 +73,10 @@ if WITH_REAL_BGQ_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90
endif
if WITH_REAL_SSE_KERNEL
if WITH_REAL_SSE_ASSEMBLY_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
else
if WITH_COMPLEX_SSE_KERNEL
if WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
endif
endif
......
......@@ -194,35 +194,57 @@ if test x"${with_ftimings}" = x"yes"; then
fi
AM_CONDITIONAL([HAVE_DETAILED_TIMINGS],[test x"$with_ftimings" = x"yes"])
AC_MSG_CHECKING(whether SSE assembler kernel can be compiled)
AC_MSG_CHECKING(whether SSE assembly kernel can be compiled)
$CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/null
if test "$?" == 0; then
can_compile_sse=yes
install_real_sse=yes
can_compile_sse_assembly=yes
install_real_sse_assembly=yes
install_complex_sse_assembly=yes
else
can_compile_sse_assembly=no
install_real_sse_assembly=no
install_complex_sse_assembly=no
fi
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse_assembly}])
dnl check whether one can compile with SSE gcc intrinsics
AC_MSG_CHECKING(whether we can compile SSE with gcc intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
double* q;
__m128d h1 = _mm_loaddup_pd(q);
return 0;
}
])],
[can_compile_sse_intrinsics=yes],
[can_compile_sse_intrinsics=no]
)
AC_MSG_RESULT([${can_compile_sse_intrinsics}])
if test "${can_compile_sse_intrinsics}" = "yes"; then
install_real_sse_intrinsics=yes
install_real_sse_block2=yes
install_real_sse_block4=yes
install_real_sse_block6=yes
install_complex_sse=yes
install_complex_sse_intrinsics=yes
install_complex_sse_block1=yes
install_complex_sse_block2=yes
else
can_compile_sse=no
install_real_sse=no
install_real_sse_intrinsics=no
install_real_sse_block2=no
install_real_sse_block4=no
install_real_sse_block6=no
install_complex_sse=no
install_complex_sse_intrinsics=no
install_complex_sse_block1=no
install_complex_sse_block2=no
fi
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}])
dnl check whether one can compile with avx - gcc intrinsics
dnl first pass: try with specified CFLAGS and CXXFLAGS
......@@ -328,10 +350,16 @@ else
install_complex_avx2_block1=no
install_complex_avx2_block2=no
fi
AM_CONDITIONAL([HAVE_SSE],[test x"$can_compile_sse" = x"yes"])
if test x"${can_compile_sse}" = x"yes" ; then
AC_DEFINE([HAVE_SSE],[1],[SSE is supported on this CPU])
AM_CONDITIONAL([HAVE_SSE_ASSEMBLY],[test x"$can_compile_sse_assembly" = x"yes"])
if test x"${can_compile_sse_assembly}" = x"yes" ; then
AC_DEFINE([HAVE_SSE_ASSEMBLY],[1],[assembly SSE is supported on this CPU])
fi
AM_CONDITIONAL([HAVE_SSE_INTRINSICS],[test x"$can_compile_sse_intrinsics" = x"yes"])
if test x"${can_compile_sse_intrinsics}" = x"yes" ; then
AC_DEFINE([HAVE_SSE_INTRINSICS],[1],[gcc intrinsics SSE is supported on this CPU])
fi
AM_CONDITIONAL([HAVE_AVX],[test x"$can_compile_avx" = x"yes"])
if test x"${can_compile_avx}" = x"yes" ; then
AC_DEFINE([HAVE_AVX],[1],[AVX is supported on this CPU])
......@@ -590,7 +618,7 @@ dnl real kernels
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-simple-kernel-only],[generic-simple-kernel],[install_real_generic_simple])
dnl sse kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-kernel-only],[sse-kernel],[install_real_sse])
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-assembly-kernel-only],[sse-assembly-kernel],[install_real_sse_assembly])
dnl bgp kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_real_bgp])
......@@ -629,7 +657,7 @@ dnl complex kernels
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-simple-kernel-only],[generic-simple-kernel],[install_complex_generic_simple])
dnl sse kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-kernel-only],[sse-kernel],[install_complex_sse])
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-assembly-kernel-only],[sse-assembly-kernel],[install_complex_sse_assembly])
dnl complex-bgp kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[install_complex_bgp])
......@@ -675,14 +703,14 @@ if test x"${install_complex_generic_simple}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_GENERIC_SIMPLE_KERNEL],[1],[can use complex generic-simple kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_KERNEL],[test x"$install_real_sse" = x"yes"])
if test x"${install_real_sse}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_KERNEL],[1],[can use real SSE kernel])
AM_CONDITIONAL([WITH_REAL_SSE_ASSEMBLY_KERNEL],[test x"$install_real_sse_assembly" = x"yes"])
if test x"${install_real_sse_assembly}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_ASSEMBLY_KERNEL],[1],[can use real SSE assembly kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_KERNEL],[test x"$install_complex_sse" = x"yes"])
if test x"${install_complex_sse}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel])
AM_CONDITIONAL([WITH_COMPLEX_SSE_ASSEMBLY_KERNEL],[test x"$install_complex_sse_assembly" = x"yes"])
if test x"${install_complex_sse_assembly}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_ASSEMBLY_KERNEL],[1],[can use complex SSE assembly kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK2_KERNEL],[test x"$install_real_sse_block2" = x"yes"])
......
......@@ -67,7 +67,7 @@
#define __forceinline __attribute__((always_inline))
#ifdef HAVE_SSE
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
......@@ -78,7 +78,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(double complex* q, d
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq);
/*
!f>#ifdef HAVE_SSE
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine single_hh_trafo_complex_sse_1hv(q, hh, pnb, pnq, pldq) bind(C, name="single_hh_trafo_complex_sse_1hv")
!f> use, intrinsic :: iso_c_binding
......
......@@ -66,7 +66,7 @@
#define __forceinline __attribute__((always_inline))
#ifdef HAVE_SSE
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
......@@ -78,7 +78,7 @@ static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(double complex* q, d
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
/*
!f>#ifdef HAVE_SSE
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine double_hh_trafo_complex_sse_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_complex_sse_2hv")
!f> use, intrinsic :: iso_c_binding
......
......@@ -67,7 +67,7 @@
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_SSE
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
......@@ -77,7 +77,7 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
__forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
/*
!f>#ifdef HAVE_SSE
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine double_hh_trafo_real_sse_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_real_sse_2hv")
!f> use, intrinsic :: iso_c_binding
......
......@@ -66,7 +66,7 @@
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_SSE
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
......@@ -76,7 +76,7 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int
__forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
/*
!f>#ifdef HAVE_SSE
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine quad_hh_trafo_real_sse_4hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="quad_hh_trafo_real_sse_4hv")
!f> use, intrinsic :: iso_c_binding
......
......@@ -66,7 +66,7 @@
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_SSE
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
......@@ -75,7 +75,7 @@ static void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int ldq, in
static void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
/*
!f>#ifdef HAVE_SSE
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine hexa_hh_trafo_real_sse_6hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="hexa_hh_trafo_real_sse_6hv")
!f> use, intrinsic :: iso_c_binding
......
......@@ -189,7 +189,7 @@ module ELPA2_utilities
#else
,0 &
#endif
#if WITH_REAL_SSE_KERNEL
#if WITH_REAL_SSE_ASSEMBLY_KERNEL
,1 &
#else
,0 &
......@@ -267,7 +267,7 @@ module ELPA2_utilities
#else
,0 &
#endif
#if WITH_COMPLEX_SSE_KERNEL
#if WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
,1 &
#else
,0 &
......
......@@ -35,7 +35,7 @@ module compute_hh_trafo_complex
use timings
#endif
#if defined(HAVE_AVX) || defined(HAVE_SSE)
#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
use kernel_interfaces
#endif
implicit none
......@@ -218,7 +218,7 @@ module compute_hh_trafo_complex
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_KERNEL */
#if defined(WITH_COMPLEX_SSE_KERNEL)
#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
......@@ -235,7 +235,7 @@ module compute_hh_trafo_complex
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_KERNEL */
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */
!#if defined(WITH_AVX_SANDYBRIDGE)
......
......@@ -46,7 +46,7 @@ module compute_hh_trafo_real
use timings
#endif
#if defined(HAVE_AVX) || defined(HAVE_SSE)
#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
use kernel_interfaces
#endif
implicit none
......@@ -194,7 +194,7 @@ module compute_hh_trafo_real
#endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */
#if defined(WITH_REAL_SSE_KERNEL)
#if defined(WITH_REAL_SSE_ASSEMBLY_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
......@@ -212,7 +212,7 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_KERNEL */
#endif /* WITH_REAL_SSE_ASSEMBLY_KERNEL */
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment