Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
15
Issues
15
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
8596c33e
Commit
8596c33e
authored
Jan 10, 2018
by
Andreas Marek
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'Skylake' into master_pre_stage
parents
50ab7a76
9516f944
Changes
4
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
431 additions
and
201 deletions
+431
-201
configure.ac
configure.ac
+39
-0
src/elpa2/kernels/complex_avx512_1hv_template.c
src/elpa2/kernels/complex_avx512_1hv_template.c
+56
-4
src/elpa2/kernels/complex_avx512_2hv_template.c
src/elpa2/kernels/complex_avx512_2hv_template.c
+81
-5
src/elpa2/kernels/real_avx512_2hv_template.c
src/elpa2/kernels/real_avx512_2hv_template.c
+255
-192
No files found.
configure.ac
View file @
8596c33e
...
@@ -851,6 +851,45 @@ if test x"${need_avx512}" = x"yes"; then
...
@@ -851,6 +851,45 @@ if test x"${need_avx512}" = x"yes"; then
AC_MSG_ERROR([Could not compile a test program with AVX512, adjust the C compiler or CFLAGS])
AC_MSG_ERROR([Could not compile a test program with AVX512, adjust the C compiler or CFLAGS])
fi
fi
AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU])
AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU])
if test x"$can_compile_avx512" = x"yes"; then
AC_MSG_CHECKING([whether we compile for Xeon PHI or Xeon])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1;
__m512d x1 = _mm512_xor_pd(h1_real, sign);
return 0;
}
])],
[can_compile_avx512_xeon=yes],
[can_compile_avx512_xeon=no]
)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1;
__m512d x1 = (__mm512d) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
return 0;
}
])],
[can_compile_avx512_xeon_phi=yes],
[can_compile_avx512_xeon_phi=no]
)
if test x"$can_compile_avx512_xeon" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON],[1],[AVX512 for Xeon is supported on this CPU])
else
if test x"$can_compile_avx512_xeon_phi" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON_PHI],[1],[AVX512 for Xeon-PHI is supported on this CPU])
else
AC_MSG_ERROR([Oho! We can neither compile AVX512 intrinsics for Xeon nor Xeon Phi. This should not happen!])
fi
fi
fi
fi
fi
AC_LANG_POP([C])
AC_LANG_POP([C])
...
...
src/elpa2/kernels/complex_avx512_1hv_template.c
View file @
8596c33e
...
@@ -63,7 +63,9 @@
...
@@ -63,7 +63,9 @@
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi64
#define _AVX512_XOR_EPI _mm512_xor_epi64
#define _SHUFFLE 0x55
#define _SHUFFLE 0x55
...
@@ -87,7 +89,9 @@
...
@@ -87,7 +89,9 @@
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi32
#define _AVX512_XOR_EPI _mm512_xor_epi32
#define _SHUFFLE 0xb1
#define _SHUFFLE 0xb1
...
@@ -361,6 +365,7 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
...
@@ -361,6 +365,7 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -368,6 +373,13 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
...
@@ -368,6 +373,13 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
@@ -539,6 +551,7 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
...
@@ -539,6 +551,7 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -546,6 +559,13 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
...
@@ -546,6 +559,13 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
@@ -688,9 +708,10 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
...
@@ -688,9 +708,10 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
x4
=
_AVX512_ADD
(
x4
,
_AVX512_FMSUBADD
(
h1_real
,
q4
,
_AVX512_SHUFFLE
(
tmp4
,
tmp4
,
_SHUFFLE
)));
x4
=
_AVX512_ADD
(
x4
,
_AVX512_FMSUBADD
(
h1_real
,
q4
,
_AVX512_SHUFFLE
(
tmp4
,
tmp4
,
_SHUFFLE
)));
}
}
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -698,6 +719,13 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
...
@@ -698,6 +719,13 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
@@ -824,6 +852,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
...
@@ -824,6 +852,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -831,6 +860,13 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
...
@@ -831,6 +860,13 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
@@ -934,6 +970,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
...
@@ -934,6 +970,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -941,6 +978,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
...
@@ -941,6 +978,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
@@ -1022,9 +1066,10 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
...
@@ -1022,9 +1066,10 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
x1
=
_AVX512_ADD
(
x1
,
_AVX512_FMSUBADD
(
h1_real
,
q1
,
_AVX512_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
)));
x1
=
_AVX512_ADD
(
x1
,
_AVX512_FMSUBADD
(
h1_real
,
q1
,
_AVX512_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
)));
}
}
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -1032,6 +1077,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
...
@@ -1032,6 +1077,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
#ifdef SINGLE_PRECISION_COMPLEX
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
src/elpa2/kernels/complex_avx512_2hv_template.c
View file @
8596c33e
/
/ This file is part of ELPA.
XEON_PHI
/
This
file
is
part
of
ELPA
.
//
//
// The ELPA library was originally created by the ELPA consortium,
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
// consisting of the following organizations:
...
@@ -65,6 +65,9 @@
...
@@ -65,6 +65,9 @@
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MASK_STOREU _mm512_mask_storeu_pd
#define _AVX512_MASK_STOREU _mm512_mask_storeu_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _SHUFFLE 0x55
#define _SHUFFLE 0x55
#ifdef HAVE_AVX512
#ifdef HAVE_AVX512
...
@@ -90,6 +93,9 @@
...
@@ -90,6 +93,9 @@
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MASK_STOREU _mm512_mask_storeu_ps
#define _AVX512_MASK_STOREU _mm512_mask_storeu_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _SHUFFLE 0xb1
#define _SHUFFLE 0xb1
#ifdef HAVE_AVX512
#ifdef HAVE_AVX512
...
@@ -380,6 +386,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
...
@@ -380,6 +386,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -388,7 +395,14 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
...
@@ -388,7 +395,14 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
x1
=
_AVX512_FMADDSUB
(
h1_real
,
x1
,
_AVX512_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
));
x1
=
_AVX512_FMADDSUB
(
h1_real
,
x1
,
_AVX512_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
));
...
@@ -409,6 +423,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
...
@@ -409,6 +423,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -426,6 +441,15 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
...
@@ -426,6 +441,15 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
h2_real
=
_AVX512_XOR
(
h2_real
,
sign
);
h2_imag
=
_AVX512_XOR
(
h2_imag
,
sign
);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
...
@@ -723,6 +747,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
...
@@ -723,6 +747,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -731,7 +756,14 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
...
@@ -731,7 +756,14 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
x1
=
_AVX512_FMADDSUB
(
h1_real
,
x1
,
_AVX512_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
));
x1
=
_AVX512_FMADDSUB
(
h1_real
,
x1
,
_AVX512_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
));
...
@@ -748,6 +780,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
...
@@ -748,6 +780,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -765,6 +798,15 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
...
@@ -765,6 +798,15 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
h2_real
=
_AVX512_XOR
(
h2_real
,
sign
);
h2_imag
=
_AVX512_XOR
(
h2_imag
,
sign
);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
...
@@ -1008,6 +1050,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
...
@@ -1008,6 +1050,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -1016,7 +1059,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
...
@@ -1016,7 +1059,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
@@ -1031,6 +1080,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
...
@@ -1031,6 +1080,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -1048,6 +1098,15 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
...
@@ -1048,6 +1098,15 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
h2_real
=
_AVX512_XOR
(
h2_real
,
sign
);
h2_imag
=
_AVX512_XOR
(
h2_imag
,
sign
);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
...
@@ -1238,6 +1297,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
...
@@ -1238,6 +1297,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -1246,7 +1306,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
...
@@ -1246,7 +1306,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
@@ -1257,6 +1323,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
...
@@ -1257,6 +1323,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
@@ -1274,6 +1341,15 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
...
@@ -1274,6 +1341,15 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
h2_real
=
_AVX512_XOR
(
h2_real
,
sign
);
h2_imag
=
_AVX512_XOR
(
h2_imag
,
sign
);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
...
...
src/elpa2/kernels/real_avx512_2hv_template.c
View file @
8596c33e
...
@@ -63,6 +63,9 @@
...
@@ -63,6 +63,9 @@
#define _AVX512_SET1 _mm512_set1_pd
#define _AVX512_SET1 _mm512_set1_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_MUL _mm512_mul_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#ifdef HAVE_AVX512
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
#define __ELPA_USE_FMA__
...
@@ -82,6 +85,9 @@
...
@@ -82,6 +85,9 @@
#define _AVX512_SET1 _mm512_set1_ps
#define _AVX512_SET1 _mm512_set1_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_MUL _mm512_mul_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#ifdef HAVE_AVX512
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
#define __ELPA_USE_FMA__
...
@@ -332,96 +338,110 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
...
@@ -332,96 +338,110 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
__AVX512_DATATYPE
tau2
=
_AVX512_SET1
(
hh
[
ldh
]);
__AVX512_DATATYPE
tau2
=
_AVX512_SET1
(
hh
[
ldh
]);
__AVX512_DATATYPE
vs
=
_AVX512_SET1
(
s
);
__AVX512_DATATYPE
vs
=
_AVX512_SET1
(
s
);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_REAL
#ifdef DOUBLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__AVX512i
)
tau1
,
(
__AVX512i
)
sign
);
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__AVX512i
)
tau1
,
(
__AVX512i
)
sign
);
#endif
#endif
#ifdef SINGLE_PRECISION_REAL
#ifdef SINGLE_PRECISION_REAL
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__AVX512i
)
tau1
,
(
__AVX512i
)
sign
);
h1
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__AVX512i
)
tau1
,
(
__AVX512i
)
sign
);
#endif
#endif
x1
=
_AVX512_MUL
(
x1
,
h1
);