Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
9516f944
Commit
9516f944
authored
Nov 15, 2017
by
Andreas Marek
Browse files
Kernels for Skylake
parent
223e1190
Changes
4
Hide whitespace changes
Inline
Side-by-side
configure.ac
View file @
9516f944
...
...
@@ -786,6 +786,45 @@ if test x"${need_avx512}" = x"yes"; then
AC_MSG_ERROR([Could not compile a test program with AVX512, adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU])
if test x"$can_compile_avx512" = x"yes"; then
AC_MSG_CHECKING([whether we compile for Xeon PHI or Xeon])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1;
__m512d x1 = _mm512_xor_pd(h1_real, sign);
return 0;
}
])],
[can_compile_avx512_xeon=yes],
[can_compile_avx512_xeon=no]
)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1;
__m512d x1 = (__mm512d) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
return 0;
}
])],
[can_compile_avx512_xeon_phi=yes],
[can_compile_avx512_xeon_phi=no]
)
if test x"$can_compile_avx512_xeon" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON],[1],[AVX512 for Xeon is supported on this CPU])
else
if test x"$can_compile_avx512_xeon_phi" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON_PHI],[1],[AVX512 for Xeon-PHI is supported on this CPU])
else
AC_MSG_ERROR([Oho! We can neither compile AVX512 intrinsics for Xeon nor Xeon Phi. This should not happen!])
fi
fi
fi
fi
AC_LANG_POP([C])
...
...
src/elpa2/kernels/complex_avx512_1hv_template.c
View file @
9516f944
...
...
@@ -63,7 +63,9 @@
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi64
#define _SHUFFLE 0x55
...
...
@@ -87,7 +89,9 @@
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi32
#define _SHUFFLE 0xb1
...
...
@@ -361,6 +365,7 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -368,6 +373,13 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -539,6 +551,7 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -546,6 +559,13 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -690,7 +710,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -698,6 +718,13 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -824,6 +851,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -831,6 +859,13 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -934,6 +969,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -941,6 +977,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -1024,7 +1067,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -1032,6 +1075,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
src/elpa2/kernels/complex_avx512_2hv_template.c
View file @
9516f944
/
/ This file is part of ELPA.
XEON_PHI
/
This
file
is
part
of
ELPA
.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
...
...
@@ -65,6 +65,9 @@
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MASK_STOREU _mm512_mask_storeu_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _SHUFFLE 0x55
#ifdef HAVE_AVX512
...
...
@@ -90,6 +93,9 @@
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MASK_STOREU _mm512_mask_storeu_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _SHUFFLE 0xb1
#ifdef HAVE_AVX512
...
...
@@ -380,6 +386,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -387,6 +394,13 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -409,6 +423,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -426,6 +441,15 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
h2_real
=
_AVX512_XOR
(
h2_real
,
sign
);
h2_imag
=
_AVX512_XOR
(
h2_imag
,
sign
);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
...
...
@@ -723,6 +747,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -730,6 +755,13 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -748,6 +780,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -765,6 +798,15 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
h2_real
=
_AVX512_XOR
(
h2_real
,
sign
);
h2_imag
=
_AVX512_XOR
(
h2_imag
,
sign
);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
...
...
@@ -1008,6 +1050,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -1016,7 +1059,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -1031,6 +1080,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -1048,6 +1098,15 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
h2_real
=
_AVX512_XOR
(
h2_real
,
sign
);
h2_imag
=
_AVX512_XOR
(
h2_imag
,
sign
);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
...
...
@@ -1238,6 +1297,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -1246,7 +1306,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -1257,6 +1323,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -1274,6 +1341,15 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
h2_real
=
_AVX512_XOR
(
h2_real
,
sign
);
h2_imag
=
_AVX512_XOR
(
h2_imag
,
sign
);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
...
...
src/elpa2/kernels/real_avx512_2hv_template.c
View file @
9516f944
...
...
@@ -63,6 +63,9 @@
#define _AVX512_SET1 _mm512_set1_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MUL _mm512_mul_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
...
...
@@ -82,6 +85,9 @@
#define _AVX512_SET1 _mm512_set1_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MUL _mm512_mul_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
...
...
@@ -332,23 +338,36 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
__AVX512_DATATYPE
tau2
=
_AVX512_SET1
(
hh
[
ldh
]);
__AVX512_DATATYPE
vs
=
_AVX512_SET1
(
s
);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__AVX512i
)
tau1
,
(
__AVX512i
)
sign
);
#endif
#ifdef SINGLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__AVX512i
)
tau1
,
(
__AVX512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1
=
_AVX512_XOR
(
tau1
,
sign
);
#endif
#endif
x1
=
_AVX512_MUL
(
x1
,
h1
);
x2
=
_AVX512_MUL
(
x2
,
h1
);
x3
=
_AVX512_MUL
(
x3
,
h1
);
x4
=
_AVX512_MUL
(
x4
,
h1
);
// check ofr xor_pd on skylake
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__AVX512i
)
tau2
,
(
__AVX512i
)
sign
);
#endif
#ifdef SINGLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__AVX512i
)
tau2
,
(
__AVX512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1
=
_AVX512_XOR
(
tau2
,
sign
);
#endif
#endif
h2
=
_AVX512_MUL
(
h1
,
vs
);
y1
=
_AVX512_FMA
(
y1
,
h1
,
_AVX512_MUL
(
x1
,
h2
));
...
...
@@ -503,23 +522,38 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
__AVX512_DATATYPE
tau2
=
_AVX512_SET1
(
hh
[
ldh
]);
__AVX512_DATATYPE
vs
=
_AVX512_SET1
(
s
);
// check for xor_pd on skylake
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__AVX512i
)
tau1
,
(
__AVX512i
)
sign
);
#endif
#ifdef SINGLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__AVX512i
)
tau1
,
(
__AVX512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1
=
_AVX512_XOR
(
tau1
,
sign
);
#endif
#endif
x1
=
_AVX512_MUL
(
x1
,
h1
);
x2
=
_AVX512_MUL
(
x2
,
h1
);
x3
=
_AVX512_MUL
(
x3
,
h1
);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__AVX512i
)
tau2
,
(
__AVX512i
)
sign
);
#endif
#ifdef SINGLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__AVX512i
)
tau2
,
(
__AVX512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1
=
_AVX512_XOR
(
tau2
,
sign
);
#endif
#endif
h2
=
_AVX512_MUL
(
h1
,
vs
);
y1
=
_AVX512_FMA
(
y1
,
h1
,
_AVX512_MUL
(
x1
,
h2
));
y2
=
_AVX512_FMA
(
y2
,
h1
,
_AVX512_MUL
(
x2
,
h2
));
...
...
@@ -649,21 +683,35 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
__AVX512_DATATYPE
tau1
=
_AVX512_SET1
(
hh
[
0
]);
__AVX512_DATATYPE
tau2
=
_AVX512_SET1
(
hh
[
ldh
]);
__AVX512_DATATYPE
vs
=
_AVX512_SET1
(
s
);
// check for xor_pd on skylake
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__AVX512i
)
tau1
,
(
__AVX512i
)
sign
);
#endif
#ifdef SINGLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__AVX512i
)
tau1
,
(
__AVX512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1
=
_AVX512_XOR
(
tau1
,
sign
);
#endif
#endif
x1
=
_AVX512_MUL
(
x1
,
h1
);
x2
=
_AVX512_MUL
(
x2
,
h1
);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__AVX512i
)
tau2
,
(
__AVX512i
)
sign
);
#endif
#ifdef SINGLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__AVX512i
)
tau2
,
(
__AVX512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1
=
_AVX512_XOR
(
tau2
,
sign
);
#endif
#endif
h2
=
_AVX512_MUL
(
h1
,
vs
);
y1
=
_AVX512_FMA
(
y1
,
h1
,
_AVX512_MUL
(
x1
,
h2
));
...
...
@@ -771,21 +819,34 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
__AVX512_DATATYPE
tau1
=
_AVX512_SET1
(
hh
[
0
]);
__AVX512_DATATYPE
tau2
=
_AVX512_SET1
(
hh
[
ldh
]);
__AVX512_DATATYPE
vs
=
_AVX512_SET1
(
s
);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__AVX512i
)
tau1
,
(
__AVX512i
)
sign
);
#endif
#ifdef SINGLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__AVX512i
)
tau1
,
(
__AVX512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1
=
_AVX512_XOR
(
tau1
,
sign
);
#endif
#endif
x1
=
_AVX512_MUL
(
x1
,
h1
);
x1
=
_AVX512_MUL
(
x1
,
h1
);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__AVX512i
)
tau2
,
(
__AVX512i
)
sign
);
#endif
#ifdef SINGLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__AVX512i
)
tau2
,
(
__AVX512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_REAL) || defined(SINGLE_PRECISION_REAL)
h1
=
_AVX512_XOR
(
tau2
,
sign
);
#endif
#endif
h2
=
_AVX512_MUL
(
h1
,
vs
);
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment