Commit a2bd28d3 authored by Andreas Marek
Browse files

Merge branch 'Skylake-master' into master_pre_stage

parents 8596c33e 6e5fd307
...@@ -853,12 +853,12 @@ if test x"${need_avx512}" = x"yes"; then ...@@ -853,12 +853,12 @@ if test x"${need_avx512}" = x"yes"; then
AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU]) AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU])
if test x"$can_compile_avx512" = x"yes"; then if test x"$can_compile_avx512" = x"yes"; then
AC_MSG_CHECKING([whether we compile for Xeon PHI or Xeon]) AC_MSG_CHECKING([whether we compile for Xeon])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([ AC_RUN_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h> #include <x86intrin.h>
int main(int argc, char **argv){ int main(int argc, char **argv){
__m512d sign; __m512d sign;
__m512d h1; __m512d h1_real;
__m512d x1 = _mm512_xor_pd(h1_real, sign); __m512d x1 = _mm512_xor_pd(h1_real, sign);
return 0; return 0;
...@@ -867,19 +867,37 @@ if test x"${need_avx512}" = x"yes"; then ...@@ -867,19 +867,37 @@ if test x"${need_avx512}" = x"yes"; then
[can_compile_avx512_xeon=yes], [can_compile_avx512_xeon=yes],
[can_compile_avx512_xeon=no] [can_compile_avx512_xeon=no]
) )
AC_COMPILE_IFELSE([AC_LANG_SOURCE([ AC_MSG_RESULT([${can_compile_avx512_xeon}])
AC_MSG_CHECKING([whether we compile for Xeon PHI])
AC_RUN_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h> #include <x86intrin.h>
int main(int argc, char **argv){ int main(int argc, char **argv){
__m512d sign; __m512d sign;
__m512d h1; __m512d h1;
__m512d h2_real;
__m512d x1 = (__mm512d) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign); __m512d x1 = (__m512d) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
return 0; return 0;
} }
])], ])],
[can_compile_avx512_xeon_phi=yes], [can_compile_avx512_xeon_phi=yes],
[can_compile_avx512_xeon_phi=no] [can_compile_avx512_xeon_phi=no]
) )
AC_MSG_RESULT([${can_compile_avx512_xeon_phi}])
# this is needed for the intel compiler
if test x"$can_compile_avx512_xeon" = x"yes" ; then
if test x"$can_compile_avx512_xeon_phi" = x"yes" ; then
# we want only one to be true; this is ugly but could not come up with a better way
grep Phi /proc/cpuinfo > /dev/null
if test x"$?" = x"0" ; then
echo "Xeon PHI found ... disabling AVX512 Xeon"
can_compile_avx512_xeon=no
fi
fi
fi
if test x"$can_compile_avx512_xeon" = x"yes"; then if test x"$can_compile_avx512_xeon" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON],[1],[AVX512 for Xeon is supported on this CPU]) AC_DEFINE([HAVE_AVX512_XEON],[1],[AVX512 for Xeon is supported on this CPU])
else else
......
XEON_PHI/ This file is part of ELPA. // This file is part of ELPA.
// //
// The ELPA library was originally created by the ELPA consortium, // The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations: // consisting of the following organizations:
......
...@@ -345,6 +345,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -345,6 +345,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign); h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif #endif
#endif
#ifdef HAVE_AVX512_XEON #ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL) #if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
...@@ -364,6 +365,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -364,6 +365,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign); h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif #endif
#endif #endif
#ifdef HAVE_AVX512_XEON #ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL) #if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau2, sign); h1 = _AVX512_XOR(tau2, sign);
...@@ -441,7 +443,6 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -441,7 +443,6 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]); q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
q4 = _AVX512_FMA(x4, h1, q4); q4 = _AVX512_FMA(x4, h1, q4);
_AVX512_STORE(&q[(nb*ldq)+3*offset],q4); _AVX512_STORE(&q[(nb*ldq)+3*offset],q4);
>>>>>>> Skylake
} }
...@@ -531,6 +532,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -531,6 +532,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign); h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif #endif
#endif #endif
#ifdef HAVE_AVX512_XEON #ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL) #if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau1, sign); h1 = _AVX512_XOR(tau1, sign);
...@@ -549,6 +551,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -549,6 +551,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign); h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif #endif
#endif #endif
#ifdef HAVE_AVX512_XEON #ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL) #if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau2, sign); h1 = _AVX512_XOR(tau2, sign);
...@@ -613,7 +616,6 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -613,7 +616,6 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]); q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
q3 = _AVX512_FMA(x3, h1, q3); q3 = _AVX512_FMA(x3, h1, q3);
_AVX512_STORE(&q[(nb*ldq)+2*offset],q3); _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
>>>>>>> Skylake
} }
...@@ -693,6 +695,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -693,6 +695,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign); h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif #endif
#endif #endif
#ifdef HAVE_AVX512_XEON #ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL) #if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau1, sign); h1 = _AVX512_XOR(tau1, sign);
...@@ -708,6 +711,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -708,6 +711,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign); h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif #endif
#endif #endif
#ifdef HAVE_AVX512_XEON #ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL) #if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau2, sign); h1 = _AVX512_XOR(tau2, sign);
...@@ -758,8 +762,6 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -758,8 +762,6 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]); q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
q2 = _AVX512_FMA(x2, h1, q2); q2 = _AVX512_FMA(x2, h1, q2);
_AVX512_STORE(&q[(nb*ldq)+offset],q2); _AVX512_STORE(&q[(nb*ldq)+offset],q2);
>>>>>>> Skylake
} }
/** /**
...@@ -830,6 +832,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -830,6 +832,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign); h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif #endif
#endif #endif
#ifdef HAVE_AVX512_XEON #ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL) #if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau1, sign); h1 = _AVX512_XOR(tau1, sign);
...@@ -837,6 +840,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -837,6 +840,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
#endif #endif
x1 = _AVX512_MUL(x1, h1); x1 = _AVX512_MUL(x1, h1);
#ifdef HAVE_AVX512_XEON_PHI #ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign); h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
...@@ -845,6 +849,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int* ...@@ -845,6 +849,7 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign); h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif #endif
#endif #endif
#ifdef HAVE_AVX512_XEON #ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL) #if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau2, sign); h1 = _AVX512_XOR(tau2, sign);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment