Commit 8596c33e authored by Andreas Marek's avatar Andreas Marek

Merge branch 'Skylake' into master_pre_stage

parents 50ab7a76 9516f944
......@@ -851,6 +851,45 @@ if test x"${need_avx512}" = x"yes"; then
AC_MSG_ERROR([Could not compile a test program with AVX512, adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU])
if test x"$can_compile_avx512" = x"yes"; then
AC_MSG_CHECKING([whether we compile for Xeon PHI or Xeon])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1;
__m512d x1 = _mm512_xor_pd(h1_real, sign);
return 0;
}
])],
[can_compile_avx512_xeon=yes],
[can_compile_avx512_xeon=no]
)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1;
__m512d x1 = (__mm512d) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
return 0;
}
])],
[can_compile_avx512_xeon_phi=yes],
[can_compile_avx512_xeon_phi=no]
)
if test x"$can_compile_avx512_xeon" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON],[1],[AVX512 for Xeon is supported on this CPU])
else
if test x"$can_compile_avx512_xeon_phi" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON_PHI],[1],[AVX512 for Xeon-PHI is supported on this CPU])
else
AC_MSG_ERROR([Oho! We can neither compile AVX512 intrinsics for Xeon nor Xeon Phi. This should not happen!])
fi
fi
fi
fi
AC_LANG_POP([C])
......
......@@ -63,7 +63,9 @@
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi64
#define _SHUFFLE 0x55
......@@ -87,7 +89,9 @@
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi32
#define _SHUFFLE 0xb1
......@@ -361,6 +365,7 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -368,6 +373,13 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -539,6 +551,7 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -546,6 +559,13 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -688,9 +708,10 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
}
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -698,6 +719,13 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -824,6 +852,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -831,6 +860,13 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -934,6 +970,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -941,6 +978,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -1022,9 +1066,10 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
}
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -1032,6 +1077,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......
// This file is part of ELPA.
XEON_PHI/ This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
......@@ -65,6 +65,9 @@
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MASK_STOREU _mm512_mask_storeu_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _SHUFFLE 0x55
#ifdef HAVE_AVX512
......@@ -90,6 +93,9 @@
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MASK_STOREU _mm512_mask_storeu_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _SHUFFLE 0xb1
#ifdef HAVE_AVX512
......@@ -380,6 +386,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -388,7 +395,14 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
......@@ -409,6 +423,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -426,6 +441,15 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h2_real = _AVX512_XOR(h2_real, sign);
h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
......@@ -723,6 +747,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -731,7 +756,14 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
......@@ -748,6 +780,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -765,6 +798,15 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h2_real = _AVX512_XOR(h2_real, sign);
h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
......@@ -1008,6 +1050,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -1016,7 +1059,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -1031,6 +1080,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -1048,6 +1098,15 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h2_real = _AVX512_XOR(h2_real, sign);
h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
......@@ -1238,6 +1297,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -1246,7 +1306,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -1257,6 +1323,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -1274,6 +1341,15 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h2_real = _AVX512_XOR(h2_real, sign);
h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment