diff --git a/configure.ac b/configure.ac
index f8a04d2e0b9874554c0e4446a49bd7af3e2e9a31..20a7ef34ea79f0ca0d9daba5190a8882989a0cf8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -851,6 +851,45 @@ if test x"${need_avx512}" = x"yes"; then
     AC_MSG_ERROR([Could not compile a test program with AVX512, adjust the C compiler or CFLAGS])
   fi
   AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU])
+
+  if test x"$can_compile_avx512" = x"yes"; then
+    AC_MSG_CHECKING([whether we compile for Xeon Phi or Xeon])
+    dnl the Xeon variant needs AVX512DQ (_mm512_xor_pd), the Xeon Phi
+    dnl variant only AVX512F (_mm512_xor_epi64 on the bit patterns)
+    AC_COMPILE_IFELSE([AC_LANG_SOURCE([
+      #include <x86intrin.h>
+      int main(int argc, char **argv){
+        __m512d sign;
+        __m512d h1;
+
+        __m512d x1 = _mm512_xor_pd(h1, sign);
+        return 0;
+      }
+    ])],
+    [can_compile_avx512_xeon=yes],
+    [can_compile_avx512_xeon=no]
+    )
+    AC_COMPILE_IFELSE([AC_LANG_SOURCE([
+      #include <x86intrin.h>
+      int main(int argc, char **argv){
+        __m512d sign;
+        __m512d h1;
+
+        __m512d x1 = (__m512d) _mm512_xor_epi64((__m512i) h1, (__m512i) sign);
+        return 0;
+      }
+    ])],
+    [can_compile_avx512_xeon_phi=yes],
+    [can_compile_avx512_xeon_phi=no]
+    )
+    if test x"$can_compile_avx512_xeon" = x"yes"; then
+      AC_MSG_RESULT([Xeon])
+      AC_DEFINE([HAVE_AVX512_XEON],[1],[AVX512 for Xeon is supported on this CPU])
+    else
+      if test x"$can_compile_avx512_xeon_phi" = x"yes"; then
+        AC_MSG_RESULT([Xeon Phi])
+        AC_DEFINE([HAVE_AVX512_XEON_PHI],[1],[AVX512 for Xeon-PHI is supported on this CPU])
+      else
+        AC_MSG_ERROR([We can compile AVX512 intrinsics neither for Xeon nor for Xeon Phi. This should not happen!])
+      fi
+    fi
+  fi
 fi
 AC_LANG_POP([C])
diff --git a/src/elpa2/kernels/complex_avx512_1hv_template.c b/src/elpa2/kernels/complex_avx512_1hv_template.c
index 953052462a8c8afcdd63b3e9b0c434f5660d3692..5d04b26cb5c4b8ef55264ec4b0ddafda48be1e1e 100644
--- a/src/elpa2/kernels/complex_avx512_1hv_template.c
+++ b/src/elpa2/kernels/complex_avx512_1hv_template.c
@@ -63,7 +63,9 @@
 #define _AVX512_MUL _mm512_mul_pd
 #define _AVX512_ADD _mm512_add_pd
 #define _AVX512_SHUFFLE _mm512_shuffle_pd
+#ifdef HAVE_AVX512_XEON
 #define _AVX512_XOR _mm512_xor_pd
+#endif
 #define _AVX512_XOR_EPI _mm512_xor_epi64
 #define _SHUFFLE 0x55
@@ -87,7 +89,9 @@
 #define _AVX512_MUL _mm512_mul_ps
 #define _AVX512_ADD _mm512_add_ps
 #define _AVX512_SHUFFLE _mm512_shuffle_ps
+#ifdef HAVE_AVX512_XEON
 #define _AVX512_XOR _mm512_xor_ps
+#endif
 #define _AVX512_XOR_EPI _mm512_xor_epi32
 #define _SHUFFLE 0xb1
@@ -361,6 +365,7 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
@@ -368,6 +373,13 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
 #ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
+#endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+#endif
 #endif
        tmp1 = _AVX512_MUL(h1_imag, x1);
@@ -539,6 +551,7 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
@@ -546,6 +559,13 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
 #ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
+#endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+#endif
 #endif
        tmp1 = _AVX512_MUL(h1_imag, x1);
@@ -688,9 +708,10 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
                x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
        }
-  h1_real = _AVX512_SET1(hh_dbl[0]);
-  h1_imag = _AVX512_SET1(hh_dbl[1]);
+       h1_real = _AVX512_SET1(hh_dbl[0]);
+       h1_imag = _AVX512_SET1(hh_dbl[1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
@@ -698,6 +719,13 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
 #ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
+#endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+#endif
 #endif
        tmp1 = _AVX512_MUL(h1_imag, x1);
@@ -824,6 +852,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
@@ -831,6 +860,13 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
 #ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
+#endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+#endif
 #endif
        tmp1 = _AVX512_MUL(h1_imag, x1);
@@ -934,6 +970,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
@@ -941,6 +978,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
 #ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
+#endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+#endif
 #endif
        tmp1 = _AVX512_MUL(h1_imag, x1);
@@ -1022,9 +1066,10 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
                x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
        }
-  h1_real = _AVX512_SET1(hh_dbl[0]);
-  h1_imag = _AVX512_SET1(hh_dbl[1]);
+       h1_real = _AVX512_SET1(hh_dbl[0]);
+       h1_imag = _AVX512_SET1(hh_dbl[1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
@@ -1032,6 +1077,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
 #ifdef SINGLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
+#endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+#endif
 #endif
        tmp1 = _AVX512_MUL(h1_imag, x1);
diff --git a/src/elpa2/kernels/complex_avx512_2hv_template.c b/src/elpa2/kernels/complex_avx512_2hv_template.c
index bce1bb4b3c4e05a7265c9023b007c9b1dd351054..05acc0e411ae1fa8289e59d54c9f478ce219784f 100644
--- a/src/elpa2/kernels/complex_avx512_2hv_template.c
+++ b/src/elpa2/kernels/complex_avx512_2hv_template.c
@@ -1,4 +1,4 @@
 // This file is part of ELPA.
 //
 // The ELPA library was originally created by the ELPA consortium,
 // consisting of the following organizations:
@@ -65,6 +65,9 @@
 #define _AVX512_ADD _mm512_add_pd
 #define _AVX512_MASK_STOREU _mm512_mask_storeu_pd
 #define _AVX512_SHUFFLE _mm512_shuffle_pd
+#ifdef HAVE_AVX512_XEON
+#define _AVX512_XOR _mm512_xor_pd
+#endif
 #define _SHUFFLE 0x55
 #ifdef HAVE_AVX512
@@ -90,6 +93,9 @@
 #define _AVX512_ADD _mm512_add_ps
 #define _AVX512_MASK_STOREU _mm512_mask_storeu_ps
 #define _AVX512_SHUFFLE _mm512_shuffle_ps
+#ifdef HAVE_AVX512_XEON
+#define _AVX512_XOR _mm512_xor_ps
+#endif
 #define _SHUFFLE 0xb1
 #ifdef HAVE_AVX512
@@ -380,6 +386,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
@@ -388,7 +395,14 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
 #endif
-  tmp1 = _AVX512_MUL(h1_imag, x1);
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+#endif
+#endif
+       tmp1 = _AVX512_MUL(h1_imag, x1);
        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
@@ -409,6 +423,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
        h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
        h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
@@ -426,6 +441,15 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
        h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
        h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
 #endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+       h2_real = _AVX512_XOR(h2_real, sign);
+       h2_imag = _AVX512_XOR(h2_imag, sign);
+#endif
+#endif
 #ifdef DOUBLE_PRECISION_COMPLEX
        tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
@@ -723,6 +747,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
@@ -731,7 +756,14 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
 #endif
-  tmp1 = _AVX512_MUL(h1_imag, x1);
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+#endif
+#endif
+       tmp1 = _AVX512_MUL(h1_imag, x1);
        x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
@@ -748,6 +780,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
        h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
        h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
@@ -765,6 +798,15 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
        h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
        h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
 #endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+       h2_real = _AVX512_XOR(h2_real, sign);
+       h2_imag = _AVX512_XOR(h2_imag, sign);
+#endif
+#endif
 #ifdef DOUBLE_PRECISION_COMPLEX
        tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
@@ -1008,6 +1050,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
@@ -1016,7 +1059,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
 #endif
-
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+#endif
+#endif
        tmp1 = _AVX512_MUL(h1_imag, x1);
@@ -1031,6 +1080,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
        h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
        h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
@@ -1048,6 +1098,15 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
        h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
        h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
 #endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+       h2_real = _AVX512_XOR(h2_real, sign);
+       h2_imag = _AVX512_XOR(h2_imag, sign);
+#endif
+#endif
 #ifdef DOUBLE_PRECISION_COMPLEX
        tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
@@ -1238,6 +1297,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
        h1_real = _AVX512_SET1(hh_dbl[0]);
        h1_imag = _AVX512_SET1(hh_dbl[1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
@@ -1246,7 +1306,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
 #endif
-
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+#endif
+#endif
        tmp1 = _AVX512_MUL(h1_imag, x1);
@@ -1257,6 +1323,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
        h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
        h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
        h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
@@ -1274,6 +1341,15 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
        h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
        h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
 #endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
+       h1_real = _AVX512_XOR(h1_real, sign);
+       h1_imag = _AVX512_XOR(h1_imag, sign);
+       h2_real = _AVX512_XOR(h2_real, sign);
+       h2_imag = _AVX512_XOR(h2_imag, sign);
+#endif
+#endif
 #ifdef DOUBLE_PRECISION_COMPLEX
        tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
diff --git a/src/elpa2/kernels/real_avx512_2hv_template.c b/src/elpa2/kernels/real_avx512_2hv_template.c
index 0b3deef11db668894011b5f1c3da799d47cf2c05..d9278eb68833d26ed8ddecde8e5942b0f3eeebcb 100644
--- a/src/elpa2/kernels/real_avx512_2hv_template.c
+++ b/src/elpa2/kernels/real_avx512_2hv_template.c
@@ -63,6 +63,9 @@
 #define _AVX512_SET1 _mm512_set1_pd
 #define _AVX512_ADD _mm512_add_pd
 #define _AVX512_MUL _mm512_mul_pd
+#ifdef HAVE_AVX512_XEON
+#define _AVX512_XOR _mm512_xor_pd
+#endif
 #ifdef HAVE_AVX512
 #define __ELPA_USE_FMA__
@@ -82,6 +85,9 @@
 #define _AVX512_SET1 _mm512_set1_ps
 #define _AVX512_ADD _mm512_add_ps
 #define _AVX512_MUL _mm512_mul_ps
+#ifdef HAVE_AVX512_XEON
+#define _AVX512_XOR _mm512_xor_ps
+#endif
 #ifdef HAVE_AVX512
 #define __ELPA_USE_FMA__
@@ -332,96 +338,110 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
        __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
        __AVX512_DATATYPE vs = _AVX512_SET1(s);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_REAL
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
 #endif
 #ifdef SINGLE_PRECISION_REAL
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
 #endif
-  x1 = _AVX512_MUL(x1, h1);
-  x2 = _AVX512_MUL(x2, h1);
-  x3 = _AVX512_MUL(x3, h1);
-  x4 = _AVX512_MUL(x4, h1);
-  // check ofr xor_pd on skylake
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
+       h1 = _AVX512_XOR(tau1, sign);
+#endif
+#endif
+       x1 = _AVX512_MUL(x1, h1);
+       x2 = _AVX512_MUL(x2, h1);
+       x3 = _AVX512_MUL(x3, h1);
+       x4 = _AVX512_MUL(x4, h1);
+
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_REAL
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
 #endif
 #ifdef SINGLE_PRECISION_REAL
-  h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
-#endif
-  h2 = _AVX512_MUL(h1, vs);
-  y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
-  y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
-  y3 = _AVX512_FMA(y3, h1, _AVX512_MUL(x3,h2));
-  y4 = _AVX512_FMA(y4, h1, _AVX512_MUL(x4,h2));
-
-  q1 = _AVX512_LOAD(q);
-  q1 = _AVX512_ADD(q1, y1);
-  _AVX512_STORE(q,q1);
-  q2 = _AVX512_LOAD(&q[offset]);
-  q2 = _AVX512_ADD(q2, y2);
-  _AVX512_STORE(&q[offset],q2);
-  q3 = _AVX512_LOAD(&q[2*offset]);
-  q3 = _AVX512_ADD(q3, y3);
-  _AVX512_STORE(&q[2*offset],q3);
-  q4 = _AVX512_LOAD(&q[3*offset]);
-  q4 = _AVX512_ADD(q4, y4);
-  _AVX512_STORE(&q[3*offset],q4);
-
-  h2 = _AVX512_SET1(hh[ldh+1]);
-
-  q1 = _AVX512_LOAD(&q[ldq]);
-  q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
-  _AVX512_STORE(&q[ldq],q1);
-  q2 = _AVX512_LOAD(&q[ldq+offset]);
-  q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
-  _AVX512_STORE(&q[ldq+offset],q2);
-  q3 = _AVX512_LOAD(&q[ldq+2*offset]);
-  q3 = _AVX512_ADD(q3, _AVX512_FMA(y3, h2, x3));
-  _AVX512_STORE(&q[ldq+2*offset],q3);
-  q4 = _AVX512_LOAD(&q[ldq+3*offset]);
-  q4 = _AVX512_ADD(q4, _AVX512_FMA(y4, h2, x4));
-  _AVX512_STORE(&q[ldq+3*offset],q4);
-
-  for (i = 2; i < nb; i++)
-  {
-    h1 = _AVX512_SET1(hh[i-1]);
-    h2 = _AVX512_SET1(hh[ldh+i]);
-
-    q1 = _AVX512_LOAD(&q[i*ldq]);
-    q1 = _AVX512_FMA(x1, h1, q1);
-    q1 = _AVX512_FMA(y1, h2, q1);
-    _AVX512_STORE(&q[i*ldq],q1);
-    q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
-    q2 = _AVX512_FMA(x2, h1, q2);
-    q2 = _AVX512_FMA(y2, h2, q2);
-    _AVX512_STORE(&q[(i*ldq)+offset],q2);
-    q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
-    q3 = _AVX512_FMA(x3, h1, q3);
-    q3 = _AVX512_FMA(y3, h2, q3);
-    _AVX512_STORE(&q[(i*ldq)+2*offset],q3);
-    q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
-    q4 = _AVX512_FMA(x4, h1, q4);
-    q4 = _AVX512_FMA(y4, h2, q4);
-    _AVX512_STORE(&q[(i*ldq)+3*offset],q4);
-
-  }
-
-  h1 = _AVX512_SET1(hh[nb-1]);
-
-  q1 = _AVX512_LOAD(&q[nb*ldq]);
-  q1 = _AVX512_FMA(x1, h1, q1);
-  _AVX512_STORE(&q[nb*ldq],q1);
-  q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
-  q2 = _AVX512_FMA(x2, h1, q2);
-  _AVX512_STORE(&q[(nb*ldq)+offset],q2);
-  q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
-  q3 = _AVX512_FMA(x3, h1, q3);
-  _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
-  q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
-  q4 = _AVX512_FMA(x4, h1, q4);
-  _AVX512_STORE(&q[(nb*ldq)+3*offset],q4);
+       h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
+#endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
+       h1 = _AVX512_XOR(tau2, sign);
+#endif
+#endif
+       h2 = _AVX512_MUL(h1, vs);
+       y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
+       y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
+       y3 = _AVX512_FMA(y3, h1, _AVX512_MUL(x3,h2));
+       y4 = _AVX512_FMA(y4, h1, _AVX512_MUL(x4,h2));
+
+       q1 = _AVX512_LOAD(q);
+       q1 = _AVX512_ADD(q1, y1);
+       _AVX512_STORE(q,q1);
+       q2 = _AVX512_LOAD(&q[offset]);
+       q2 = _AVX512_ADD(q2, y2);
+       _AVX512_STORE(&q[offset],q2);
+       q3 = _AVX512_LOAD(&q[2*offset]);
+       q3 = _AVX512_ADD(q3, y3);
+       _AVX512_STORE(&q[2*offset],q3);
+       q4 = _AVX512_LOAD(&q[3*offset]);
+       q4 = _AVX512_ADD(q4, y4);
+       _AVX512_STORE(&q[3*offset],q4);
+
+       h2 = _AVX512_SET1(hh[ldh+1]);
+
+       q1 = _AVX512_LOAD(&q[ldq]);
+       q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
+       _AVX512_STORE(&q[ldq],q1);
+       q2 = _AVX512_LOAD(&q[ldq+offset]);
+       q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
+       _AVX512_STORE(&q[ldq+offset],q2);
+       q3 = _AVX512_LOAD(&q[ldq+2*offset]);
+       q3 = _AVX512_ADD(q3, _AVX512_FMA(y3, h2, x3));
+       _AVX512_STORE(&q[ldq+2*offset],q3);
+       q4 = _AVX512_LOAD(&q[ldq+3*offset]);
+       q4 = _AVX512_ADD(q4, _AVX512_FMA(y4, h2, x4));
+       _AVX512_STORE(&q[ldq+3*offset],q4);
+
+       for (i = 2; i < nb; i++)
+       {
+               h1 = _AVX512_SET1(hh[i-1]);
+               h2 = _AVX512_SET1(hh[ldh+i]);
+
+               q1 = _AVX512_LOAD(&q[i*ldq]);
+               q1 = _AVX512_FMA(x1, h1, q1);
+               q1 = _AVX512_FMA(y1, h2, q1);
+               _AVX512_STORE(&q[i*ldq],q1);
+               q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
+               q2 = _AVX512_FMA(x2, h1, q2);
+               q2 = _AVX512_FMA(y2, h2, q2);
+               _AVX512_STORE(&q[(i*ldq)+offset],q2);
+               q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
+               q3 = _AVX512_FMA(x3, h1, q3);
+               q3 = _AVX512_FMA(y3, h2, q3);
+               _AVX512_STORE(&q[(i*ldq)+2*offset],q3);
+               q4 = _AVX512_LOAD(&q[(i*ldq)+3*offset]);
+               q4 = _AVX512_FMA(x4, h1, q4);
+               q4 = _AVX512_FMA(y4, h2, q4);
+               _AVX512_STORE(&q[(i*ldq)+3*offset],q4);
+
+       }
+
+       h1 = _AVX512_SET1(hh[nb-1]);
+
+       q1 = _AVX512_LOAD(&q[nb*ldq]);
+       q1 = _AVX512_FMA(x1, h1, q1);
+       _AVX512_STORE(&q[nb*ldq],q1);
+       q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
+       q2 = _AVX512_FMA(x2, h1, q2);
+       _AVX512_STORE(&q[(nb*ldq)+offset],q2);
+       q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
+       q3 = _AVX512_FMA(x3, h1, q3);
+       _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
+       q4 = _AVX512_LOAD(&q[(nb*ldq)+3*offset]);
+       q4 = _AVX512_FMA(x4, h1, q4);
+       _AVX512_STORE(&q[(nb*ldq)+3*offset],q4);
 }
@@ -503,81 +523,97 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
        __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
        __AVX512_DATATYPE vs = _AVX512_SET1(s);
-  // check for xor_pd on skylake
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_REAL
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
 #endif
 #ifdef SINGLE_PRECISION_REAL
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
 #endif
-  x1 = _AVX512_MUL(x1, h1);
-  x2 = _AVX512_MUL(x2, h1);
-  x3 = _AVX512_MUL(x3, h1);
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
+       h1 = _AVX512_XOR(tau1, sign);
+#endif
+#endif
+
+       x1 = _AVX512_MUL(x1, h1);
+       x2 = _AVX512_MUL(x2, h1);
+       x3 = _AVX512_MUL(x3, h1);
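+
+       // sign flip of tau2: Xeon Phi (AVX512F only) has to xor the bit
+       // patterns as integers, while Xeon (AVX512DQ) can use the native
+       // floating-point xor behind _AVX512_XOR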
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_REAL
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
 #endif
 #ifdef SINGLE_PRECISION_REAL
-  h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
-#endif
-  h2 = _AVX512_MUL(h1, vs);
-  y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
-  y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
-  y3 = _AVX512_FMA(y3, h1, _AVX512_MUL(x3,h2));
-
-  q1 = _AVX512_LOAD(q);
-  q1 = _AVX512_ADD(q1, y1);
-  _AVX512_STORE(q,q1);
-  q2 = _AVX512_LOAD(&q[offset]);
-  q2 = _AVX512_ADD(q2, y2);
-  _AVX512_STORE(&q[offset],q2);
-  q3 = _AVX512_LOAD(&q[2*offset]);
-  q3 = _AVX512_ADD(q3, y3);
-  _AVX512_STORE(&q[2*offset],q3);
-
-  h2 = _AVX512_SET1(hh[ldh+1]);
-
-  q1 = _AVX512_LOAD(&q[ldq]);
-  q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
-  _AVX512_STORE(&q[ldq],q1);
-  q2 = _AVX512_LOAD(&q[ldq+offset]);
-  q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
-  _AVX512_STORE(&q[ldq+offset],q2);
-  q3 = _AVX512_LOAD(&q[ldq+2*offset]);
-  q3 = _AVX512_ADD(q3, _AVX512_FMA(y3, h2, x3));
-  _AVX512_STORE(&q[ldq+2*offset],q3);
-
-  for (i = 2; i < nb; i++)
-  {
-    h1 = _AVX512_SET1(hh[i-1]);
-    h2 = _AVX512_SET1(hh[ldh+i]);
-
-    q1 = _AVX512_LOAD(&q[i*ldq]);
-    q1 = _AVX512_FMA(x1, h1, q1);
-    q1 = _AVX512_FMA(y1, h2, q1);
-    _AVX512_STORE(&q[i*ldq],q1);
-    q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
-    q2 = _AVX512_FMA(x2, h1, q2);
-    q2 = _AVX512_FMA(y2, h2, q2);
-    _AVX512_STORE(&q[(i*ldq)+offset],q2);
-    q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
-    q3 = _AVX512_FMA(x3, h1, q3);
-    q3 = _AVX512_FMA(y3, h2, q3);
-    _AVX512_STORE(&q[(i*ldq)+2*offset],q3);
-
-  }
-
-  h1 = _AVX512_SET1(hh[nb-1]);
-
-  q1 = _AVX512_LOAD(&q[nb*ldq]);
-  q1 = _AVX512_FMA(x1, h1, q1);
-  _AVX512_STORE(&q[nb*ldq],q1);
-  q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
-  q2 = _AVX512_FMA(x2, h1, q2);
-  _AVX512_STORE(&q[(nb*ldq)+offset],q2);
-  q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
-  q3 = _AVX512_FMA(x3, h1, q3);
-  _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
+       h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
+#endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
+       h1 = _AVX512_XOR(tau2, sign);
+#endif
+#endif
+
+       h2 = _AVX512_MUL(h1, vs);
+       y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
+       y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
+       y3 = _AVX512_FMA(y3, h1, _AVX512_MUL(x3,h2));
+
+       q1 = _AVX512_LOAD(q);
+       q1 = _AVX512_ADD(q1, y1);
+       _AVX512_STORE(q,q1);
+       q2 = _AVX512_LOAD(&q[offset]);
+       q2 = _AVX512_ADD(q2, y2);
+       _AVX512_STORE(&q[offset],q2);
+       q3 = _AVX512_LOAD(&q[2*offset]);
+       q3 = _AVX512_ADD(q3, y3);
+       _AVX512_STORE(&q[2*offset],q3);
+
+       h2 = _AVX512_SET1(hh[ldh+1]);
+
+       q1 = _AVX512_LOAD(&q[ldq]);
+       q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
+       _AVX512_STORE(&q[ldq],q1);
+       q2 = _AVX512_LOAD(&q[ldq+offset]);
+       q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
+       _AVX512_STORE(&q[ldq+offset],q2);
+       q3 = _AVX512_LOAD(&q[ldq+2*offset]);
+       q3 = _AVX512_ADD(q3, _AVX512_FMA(y3, h2, x3));
+       _AVX512_STORE(&q[ldq+2*offset],q3);
+
+       for (i = 2; i < nb; i++)
+       {
+               h1 = _AVX512_SET1(hh[i-1]);
+               h2 = _AVX512_SET1(hh[ldh+i]);
+
+               q1 = _AVX512_LOAD(&q[i*ldq]);
+               q1 = _AVX512_FMA(x1, h1, q1);
+               q1 = _AVX512_FMA(y1, h2, q1);
+               _AVX512_STORE(&q[i*ldq],q1);
+               q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
+               q2 = _AVX512_FMA(x2, h1, q2);
+               q2 = _AVX512_FMA(y2, h2, q2);
+               _AVX512_STORE(&q[(i*ldq)+offset],q2);
+               q3 = _AVX512_LOAD(&q[(i*ldq)+2*offset]);
+               q3 = _AVX512_FMA(x3, h1, q3);
+               q3 = _AVX512_FMA(y3, h2, q3);
+               _AVX512_STORE(&q[(i*ldq)+2*offset],q3);
+
+       }
+
+       h1 = _AVX512_SET1(hh[nb-1]);
+
+       q1 = _AVX512_LOAD(&q[nb*ldq]);
+       q1 = _AVX512_FMA(x1, h1, q1);
+       _AVX512_STORE(&q[nb*ldq],q1);
+       q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
+       q2 = _AVX512_FMA(x2, h1, q2);
+       _AVX512_STORE(&q[(nb*ldq)+offset],q2);
+       q3 = _AVX512_LOAD(&q[(nb*ldq)+2*offset]);
+       q3 = _AVX512_FMA(x3, h1, q3);
+       _AVX512_STORE(&q[(nb*ldq)+2*offset],q3);
 }
@@ -646,68 +682,83 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
        // Rank-2 update of Q [16 x nb+1]
        /////////////////////////////////////////////////////
-  __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
-  __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
-  __AVX512_DATATYPE vs = _AVX512_SET1(s);
-  // check for xor_pd on skylake
+       __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
+       __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
+       __AVX512_DATATYPE vs = _AVX512_SET1(s);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_REAL
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
 #endif
 #ifdef SINGLE_PRECISION_REAL
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
 #endif
-  x1 = _AVX512_MUL(x1, h1);
-  x2 = _AVX512_MUL(x2, h1);
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
+       h1 = _AVX512_XOR(tau1, sign);
+#endif
+#endif
+       x1 = _AVX512_MUL(x1, h1);
+       x2 = _AVX512_MUL(x2, h1);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_REAL
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
 #endif
 #ifdef SINGLE_PRECISION_REAL
-  h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
+       h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
+#endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
+       h1 = _AVX512_XOR(tau2, sign);
+#endif
 #endif
-  h2 = _AVX512_MUL(h1, vs);
-  y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
-  y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
+       h2 = _AVX512_MUL(h1, vs);
-  q1 = _AVX512_LOAD(q);
-  q1 = _AVX512_ADD(q1, y1);
-  _AVX512_STORE(q,q1);
-  q2 = _AVX512_LOAD(&q[offset]);
-  q2 = _AVX512_ADD(q2, y2);
-  _AVX512_STORE(&q[offset],q2);
+       y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
+       y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
-  h2 = _AVX512_SET1(hh[ldh+1]);
+       q1 = _AVX512_LOAD(q);
+       q1 = _AVX512_ADD(q1, y1);
+       _AVX512_STORE(q,q1);
+       q2 = _AVX512_LOAD(&q[offset]);
+       q2 = _AVX512_ADD(q2, y2);
+       _AVX512_STORE(&q[offset],q2);
-  q1 = _AVX512_LOAD(&q[ldq]);
-  q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
-  _AVX512_STORE(&q[ldq],q1);
-  q2 = _AVX512_LOAD(&q[ldq+offset]);
-  q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
-  _AVX512_STORE(&q[ldq+offset],q2);
+       h2 = _AVX512_SET1(hh[ldh+1]);
-  for (i = 2; i < nb; i++)
-  {
-    h1 = _AVX512_SET1(hh[i-1]);
-    h2 = _AVX512_SET1(hh[ldh+i]);
+       q1 = _AVX512_LOAD(&q[ldq]);
+       q1 = _AVX512_ADD(q1, _AVX512_FMA(y1, h2, x1));
+       _AVX512_STORE(&q[ldq],q1);
+       q2 = _AVX512_LOAD(&q[ldq+offset]);
+       q2 = _AVX512_ADD(q2, _AVX512_FMA(y2, h2, x2));
+       _AVX512_STORE(&q[ldq+offset],q2);
-  q1 = _AVX512_LOAD(&q[i*ldq]);
-  q1 = _AVX512_FMA(x1, h1, q1);
-  q1 = _AVX512_FMA(y1, h2, q1);
-  _AVX512_STORE(&q[i*ldq],q1);
-  q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
-  q2 = _AVX512_FMA(x2, h1, q2);
-  q2 = _AVX512_FMA(y2, h2, q2);
-  _AVX512_STORE(&q[(i*ldq)+offset],q2);
-  }
+       for (i = 2; i < nb; i++)
+       {
+               h1 = _AVX512_SET1(hh[i-1]);
+               h2 = _AVX512_SET1(hh[ldh+i]);
-  h1 = _AVX512_SET1(hh[nb-1]);
+               q1 = _AVX512_LOAD(&q[i*ldq]);
+               q1 = _AVX512_FMA(x1, h1, q1);
+               q1 = _AVX512_FMA(y1, h2, q1);
+               _AVX512_STORE(&q[i*ldq],q1);
+               q2 = _AVX512_LOAD(&q[(i*ldq)+offset]);
+               q2 = _AVX512_FMA(x2, h1, q2);
+               q2 = _AVX512_FMA(y2, h2, q2);
+               _AVX512_STORE(&q[(i*ldq)+offset],q2);
+       }
-  q1 = _AVX512_LOAD(&q[nb*ldq]);
-  q1 = _AVX512_FMA(x1, h1, q1);
-  _AVX512_STORE(&q[nb*ldq],q1);
-  q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
-  q2 = _AVX512_FMA(x2, h1, q2);
-  _AVX512_STORE(&q[(nb*ldq)+offset],q2);
+       h1 = _AVX512_SET1(hh[nb-1]);
+
+       q1 = _AVX512_LOAD(&q[nb*ldq]);
+       q1 = _AVX512_FMA(x1, h1, q1);
+       _AVX512_STORE(&q[nb*ldq],q1);
+       q2 = _AVX512_LOAD(&q[(nb*ldq)+offset]);
+       q2 = _AVX512_FMA(x2, h1, q2);
+       _AVX512_STORE(&q[(nb*ldq)+offset],q2);
 }
@@ -768,24 +819,36 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
        // Rank-2 update of Q [8 x nb+1]
        /////////////////////////////////////////////////////
-  __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
-  __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
-  __AVX512_DATATYPE vs = _AVX512_SET1(s);
-
+       __AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
+       __AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
+       __AVX512_DATATYPE vs = _AVX512_SET1(s);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_REAL
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
 #endif
 #ifdef SINGLE_PRECISION_REAL
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
+#endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
+       h1 = _AVX512_XOR(tau1, sign);
+#endif
 #endif
-  x1 = _AVX512_MUL(x1, h1);
-
+       x1 = _AVX512_MUL(x1, h1);
+#ifdef HAVE_AVX512_XEON_PHI
 #ifdef DOUBLE_PRECISION_REAL
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
 #endif
 #ifdef SINGLE_PRECISION_REAL
        h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
+#endif
+#endif
+#ifdef HAVE_AVX512_XEON
+#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
+       h1 = _AVX512_XOR(tau2, sign);
+#endif
 #endif
        h2 = _AVX512_MUL(h1, vs);
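
Note on the dispatch above: _mm512_xor_pd/_mm512_xor_ps require AVX512DQ,
which Xeon (Skylake-SP and later) provides but Xeon Phi (Knights Landing)
does not; Xeon Phi only offers the AVX512F integer xor. A minimal standalone
sketch of the two sign-flip variants that the configure checks distinguish
(function names are illustrative, not part of the patch):

    #include <x86intrin.h>

    // Xeon (AVX512DQ): native floating-point xor, compiles to vxorpd
    static __m512d flip_sign_xeon(__m512d x, __m512d sign)
    {
        return _mm512_xor_pd(x, sign);
    }

    // Xeon Phi (AVX512F): reinterpret the lanes as 64-bit integers, xor,
    // and reinterpret back; the casts only relabel the bit pattern
    static __m512d flip_sign_xeon_phi(__m512d x, __m512d sign)
    {
        return (__m512d) _mm512_xor_epi64((__m512i) x, (__m512i) sign);
    }

    // a mask that flips every lane: only the sign bits are set
    // __m512d sign = _mm512_set1_pd(-0.0);

Both variants produce bit-identical results; the split exists purely because
the two CPU families support different AVX512 extension sets.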