Commit 9516f944 authored by Andreas Marek's avatar Andreas Marek

Kernels for Skylake

parent 223e1190
......@@ -786,6 +786,45 @@ if test x"${need_avx512}" = x"yes"; then
AC_MSG_ERROR([Could not compile a test program with AVX512, adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU])
if test x"$can_compile_avx512" = x"yes"; then
AC_MSG_CHECKING([whether we compile for Xeon PHI or Xeon])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1;
__m512d x1 = _mm512_xor_pd(h1_real, sign);
return 0;
}
])],
[can_compile_avx512_xeon=yes],
[can_compile_avx512_xeon=no]
)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1;
__m512d x1 = (__mm512d) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
return 0;
}
])],
[can_compile_avx512_xeon_phi=yes],
[can_compile_avx512_xeon_phi=no]
)
if test x"$can_compile_avx512_xeon" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON],[1],[AVX512 for Xeon is supported on this CPU])
else
if test x"$can_compile_avx512_xeon_phi" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON_PHI],[1],[AVX512 for Xeon-PHI is supported on this CPU])
else
AC_MSG_ERROR([Oho! We can neither compile AVX512 intrinsics for Xeon nor Xeon Phi. This should not happen!])
fi
fi
fi
fi
AC_LANG_POP([C])
......
......@@ -63,7 +63,9 @@
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi64
#define _SHUFFLE 0x55
......@@ -87,7 +89,9 @@
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi32
#define _SHUFFLE 0xb1
......@@ -361,6 +365,7 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -368,6 +373,13 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -539,6 +551,7 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -546,6 +559,13 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -690,7 +710,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -698,6 +718,13 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -824,6 +851,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -831,6 +859,13 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -934,6 +969,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -941,6 +977,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -1024,7 +1067,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -1032,6 +1075,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......
// This file is part of ELPA.
XEON_PHI/ This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
......@@ -65,6 +65,9 @@
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MASK_STOREU _mm512_mask_storeu_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _SHUFFLE 0x55
#ifdef HAVE_AVX512
......@@ -90,6 +93,9 @@
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MASK_STOREU _mm512_mask_storeu_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _SHUFFLE 0xb1
#ifdef HAVE_AVX512
......@@ -380,6 +386,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -387,6 +394,13 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -409,6 +423,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -426,6 +441,15 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h2_real = _AVX512_XOR(h2_real, sign);
h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
......@@ -723,6 +747,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -730,6 +755,13 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -748,6 +780,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -765,6 +798,15 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h2_real = _AVX512_XOR(h2_real, sign);
h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
......@@ -1008,6 +1050,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -1016,7 +1059,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -1031,6 +1080,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -1048,6 +1098,15 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h2_real = _AVX512_XOR(h2_real, sign);
h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
......@@ -1238,6 +1297,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -1246,7 +1306,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -1257,6 +1323,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -1274,6 +1341,15 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h2_real = _AVX512_XOR(h2_real, sign);
h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
......
......@@ -63,6 +63,9 @@
#define _AVX512_SET1 _mm512_set1_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MUL _mm512_mul_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
......@@ -82,6 +85,9 @@
#define _AVX512_SET1 _mm512_set1_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MUL _mm512_mul_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
......@@ -332,23 +338,36 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau1, sign);
#endif
#endif
x1 = _AVX512_MUL(x1, h1);
x2 = _AVX512_MUL(x2, h1);
x3 = _AVX512_MUL(x3, h1);
x4 = _AVX512_MUL(x4, h1);
// check ofr xor_pd on skylake
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau2, sign);
#endif
#endif
h2 = _AVX512_MUL(h1, vs);
y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
......@@ -503,23 +522,38 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
// check for xor_pd on skylake
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau1, sign);
#endif
#endif
x1 = _AVX512_MUL(x1, h1);
x2 = _AVX512_MUL(x2, h1);
x3 = _AVX512_MUL(x3, h1);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau2, sign);
#endif
#endif
h2 = _AVX512_MUL(h1, vs);
y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
......@@ -649,21 +683,35 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
__AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
// check for xor_pd on skylake
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau1, sign);
#endif
#endif
x1 = _AVX512_MUL(x1, h1);
x2 = _AVX512_MUL(x2, h1);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau2, sign);
#endif
#endif
h2 = _AVX512_MUL(h1, vs);
y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
......@@ -771,21 +819,34 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
__AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau1, sign);
#endif
#endif
x1 = _AVX512_MUL(x1, h1);
x1 = _AVX512_MUL(x1, h1);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign);
#endif
#ifdef SINGLE_PRECISION_REAL
h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1 = _AVX512_XOR(tau2, sign);
#endif
#endif
h2 = _AVX512_MUL(h1, vs);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment