Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
39fbe8ba
Commit
39fbe8ba
authored
Jun 12, 2019
by
Andreas Marek
Browse files
Unify complex avx512 block2 kernel
parent
0d1da4da
Changes
7
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Doxyfile.in
View file @
39fbe8ba
...
...
@@ -932,7 +932,6 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_simple.F90 \
@top_srcdir@/src/elpa2/kernels/complex.F90 \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \
...
...
Makefile.am
View file @
39fbe8ba
...
...
@@ -791,7 +791,6 @@ EXTRA_DIST = \
src/elpa2/elpa2_trans_ev_band_to_full_template.F90
\
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90
\
src/elpa2/elpa2_tridiag_band_template.F90
\
src/elpa2/kernels/complex_avx512_2hv_template.c
\
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c
\
src/elpa2/kernels/complex_template.F90
\
src/elpa2/kernels/real_vsx_4hv_template.c
\
...
...
ci_test_scripts/generate_gitlab_ci_tests.py
View file @
39fbe8ba
...
...
@@ -702,9 +702,9 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
print
(
"# "
+
cc
+
"-"
+
fc
+
"-"
+
m
+
"-"
+
o
+
"-"
+
p
+
"-"
+
a
+
"-"
+
b
+
"-"
+
g
+
"-"
+
cov
+
"-"
+
instr
+
"-"
+
addr
)
print
(
cc
+
"-"
+
fc
+
"-"
+
m
+
"-"
+
o
+
"-"
+
p
+
"-"
+
a
+
"-"
+
b
+
"-"
+
g
+
"-"
+
cov
+
"-"
+
instr
+
"-"
+
addr
+
"-jobs:"
)
if
(
MasterOnly
):
print
(
" only:"
)
print
(
" - /.*master.*/"
)
#
if (MasterOnly):
#
print(" only:")
#
print(" - /.*master.*/")
if
(
instr
==
"power8"
):
print
(
" allow_failure: true"
)
print
(
" tags:"
)
...
...
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c
View file @
39fbe8ba
...
...
@@ -247,9 +247,11 @@
#endif
#define _SIMD_BROADCAST 1
#define _SIMD_SET1 _mm512_set1_pd
#define _SIMD_SET _mm512_set_pd
#define _SIMD_XOR_EPI _mm512_xor_epi64
#define _SIMD_ADDSUB 1
#define _SIMD_SHUFFLE _mm512_shuffle_pd
#define _SIMD_MASK_STOREU _mm512_mask_storeu_pd
#define _SHUFFLE 0x55
#ifdef HAVE_AVX512
...
...
@@ -277,8 +279,10 @@
#endif
#define _SIMD_BROADCAST 1
#define _SIMD_SET1 _mm512_set1_ps
#define _SIMD_SET _mm512_set_ps
#define _SIMD_ADDSUB 1
#define _SIMD_SHUFFLE _mm512_shuffle_ps
#define _SIMD_MASK_STOREU _mm512_mask_storeu_ps
#define _SIMD_XOR_EPI _mm512_xor_epi32
#define _SHUFFLE 0xb1
...
...
@@ -723,6 +727,37 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine double_hh_trafo_complex_AVX512_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_AVX512_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine double_hh_trafo_complex_AVX512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_AVX512_2hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void
CONCAT_7ARGS
(
PREFIX
,
_hh_trafo_complex_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
#ifdef BLOCK1
)
...
...
@@ -1127,11 +1162,50 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 16
#define STEP_SIZE 16
#define UPPER_BOUND 12
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 32
#define STEP_SIZE 32
#define UPPER_BOUND 24
#endif
#endif
/* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
for
(
i
=
0
;
i
<
nq
-
UPPER_BOUND
;
i
+=
STEP_SIZE
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
worked_on
+=
ROW_LENGTH
;
}
#endif
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
for
(
i
=
0
;
i
<
nq
-
12
;
i
+=
16
)
{
hh_trafo_complex_kernel_4_AVX512_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
hh_trafo_complex_kernel_4_AVX512_2hv_double
(
&
q
[
i
+
4
],
hh
,
nb
,
ldq
,
ldh
,
s
);
hh_trafo_complex_kernel_4_AVX512_2hv_double
(
&
q
[
i
+
8
],
hh
,
nb
,
ldq
,
ldh
,
s
);
hh_trafo_complex_kernel_4_AVX512_2hv_double
(
&
q
[
i
+
12
],
hh
,
nb
,
ldq
,
ldh
,
s
);
worked_on
+=
16
;
}
#endif
#ifdef SINGLE_PRECISION_COMPLEX
for
(
i
=
0
;
i
<
nq
-
24
;
i
+=
32
)
{
hh_trafo_complex_kernel_8_AVX512_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
hh_trafo_complex_kernel_8_AVX512_2hv_single
(
&
q
[
i
+
8
],
hh
,
nb
,
ldq
,
ldh
,
s
);
hh_trafo_complex_kernel_8_AVX512_2hv_single
(
&
q
[
i
+
16
],
hh
,
nb
,
ldq
,
ldh
,
s
);
hh_trafo_complex_kernel_8_AVX512_2hv_single
(
&
q
[
i
+
24
],
hh
,
nb
,
ldq
,
ldh
,
s
);
worked_on
+=
32
;
}
#endif
#endif
if
(
nq
==
i
)
{
...
...
@@ -1158,11 +1232,44 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 24
#endif
#endif
/* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
worked_on
+=
ROW_LENGTH
;
}
#endif
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
if
(
nq
-
i
==
12
)
{
hh_trafo_complex_kernel_4_AVX512_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
hh_trafo_complex_kernel_4_AVX512_2hv_double
(
&
q
[
i
+
4
],
hh
,
nb
,
ldq
,
ldh
,
s
);
hh_trafo_complex_kernel_4_AVX512_2hv_double
(
&
q
[
i
+
8
],
hh
,
nb
,
ldq
,
ldh
,
s
);
worked_on
+=
12
;
}
#endif
#ifdef SINGLE_PRECISION_COMPLEX
if
(
nq
-
i
==
24
)
{
hh_trafo_complex_kernel_8_AVX512_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
hh_trafo_complex_kernel_8_AVX512_2hv_single
(
&
q
[
i
+
8
],
hh
,
nb
,
ldq
,
ldh
,
s
);
hh_trafo_complex_kernel_8_AVX512_2hv_single
(
&
q
[
i
+
16
],
hh
,
nb
,
ldq
,
ldh
,
s
);
worked_on
+=
24
;
}
#endif
#endif
#if VEC_SET == SSE_128
#undef ROW_LENGTH
...
...
@@ -1184,12 +1291,42 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 16
#endif
#endif
/* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
worked_on
+=
ROW_LENGTH
;
}
#endif
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
if
(
nq
-
i
==
8
)
{
hh_trafo_complex_kernel_4_AVX512_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
hh_trafo_complex_kernel_4_AVX512_2hv_double
(
&
q
[
i
+
4
],
hh
,
nb
,
ldq
,
ldh
,
s
);
worked_on
+=
8
;
}
#endif
#ifdef SINGLE_PRECISION_COMPLEX
if
(
nq
-
i
==
16
)
{
hh_trafo_complex_kernel_8_AVX512_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
hh_trafo_complex_kernel_8_AVX512_2hv_single
(
&
q
[
i
+
8
],
hh
,
nb
,
ldq
,
ldh
,
s
);
worked_on
+=
16
;
}
#endif
#endif
#if VEC_SET == SSE_128
#undef ROW_LENGTH
...
...
@@ -1211,20 +1348,50 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
#endif
/* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
worked_on
+=
ROW_LENGTH
;
}
#endif
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
if
(
nq
-
i
==
4
)
{
hh_trafo_complex_kernel_4_AVX512_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
worked_on
+=
4
;
}
#endif
#ifdef SINGLE_PRECISION_COMPLEX
if
(
nq
-
i
==
8
)
{
hh_trafo_complex_kernel_8_AVX512_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
worked_on
+=
8
;
}
#endif
#endif
#endif
/* BLOCK2 */
#ifdef WITH_DEBUG
//
#ifdef WITH_DEBUG
if
(
worked_on
!=
nq
)
{
printf
(
"Error in complex SIMD_SET BLOCK BLOCK kernel %d %d
\n
"
,
worked_on
,
nq
);
abort
();
}
#endif
//
#endif
}
...
...
@@ -1300,7 +1467,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
__SIMD_DATATYPE
sign
=
(
__SIMD_DATATYPE
)
_mm512_set_epi64
(
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
);
__SIMD_DATATYPE
sign
=
(
__SIMD_DATATYPE
)
_mm512_set
1
_epi64
(
0x8000000000000000
);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__SIMD_DATATYPE
sign
=
(
__SIMD_DATATYPE
)
_mm512_set1_epi32
(
0x80000000
);
...
...
@@ -1331,6 +1498,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real
=
_SIMD_SET1
(
hh_dbl
[(
ldh
+
1
)
*
2
]);
h2_imag
=
_SIMD_SET1
(
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag
=
_SIMD_XOR
(
h2_imag
,
sign
);
...
...
@@ -1485,6 +1657,12 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[((
ldh
+
i
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real
=
_SIMD_SET1
(
hh_dbl
[(
ldh
+
i
)
*
2
]);
h2_imag
=
_SIMD_SET1
(
hh_dbl
[((
ldh
+
i
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag
=
_SIMD_XOR
(
h2_imag
,
sign
);
...
...
@@ -1551,6 +1729,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h1_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[((
nb
-
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h1_real
=
_SIMD_SET1
(
hh_dbl
[(
nb
-
1
)
*
2
]);
h1_imag
=
_SIMD_SET1
(
hh_dbl
[((
nb
-
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag
=
_SIMD_XOR
(
h1_imag
,
sign
);
...
...
@@ -1643,7 +1826,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
/* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
h1_real
=
_SIMD_XOR
(
h1_real
,
sign
);
h1_imag
=
_SIMD_XOR
(
h1_imag
,
sign
);
...
...
@@ -1716,10 +1898,49 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[(
ldh
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h1_real
=
_SIMD_SET1
(
hh_dbl
[
ldh
*
2
]);
h1_imag
=
_SIMD_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
h2_real
=
_SIMD_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_SIMD_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
h2_real
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h2_real
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
/* HAVE_AVX512_XEON_PHI */
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_SIMD_XOR
(
h1_real
,
sign
);
h1_imag
=
_SIMD_XOR
(
h1_imag
,
sign
);
h2_real
=
_SIMD_XOR
(
h2_real
,
sign
);
h2_imag
=
_SIMD_XOR
(
h2_imag
,
sign
);
#endif
#endif
#endif
/* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
h1_real
=
_SIMD_XOR
(
h1_real
,
sign
);
h1_imag
=
_SIMD_XOR
(
h1_imag
,
sign
);
h2_real
=
_SIMD_XOR
(
h2_real
,
sign
);
h2_imag
=
_SIMD_XOR
(
h2_imag
,
sign
);
#endif
/* VEC_SET != AVX_512 */
#if VEC_SET == SSE_128
#ifdef SINGLE_PRECISION_COMPLEX
...
...
@@ -1739,6 +1960,18 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2
=
_SIMD_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
s_dbl
[
1
],
s_dbl
[
0
],
s_dbl
[
1
],
s_dbl
[
0
],
s_dbl
[
1
],
s_dbl
[
0
]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
tmp2
=
(
__SIMD_DATATYPE
)
_mm512_set1_pd
(
*
(
double
*
)(
&
s_dbl
[
0
]));
#endif
#endif
/* VEC_SET == AVX_512 */
tmp1
=
_SIMD_MUL
(
h2_imag
,
tmp2
);
#ifdef __ELPA_USE_FMA__
tmp2
=
_SIMD_FMADDSUB
(
h2_real
,
tmp2
,
_SIMD_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
));
...
...
@@ -1746,6 +1979,13 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp2
=
_SIMD_ADDSUB
(
_SIMD_MUL
(
h2_real
,
tmp2
),
_SIMD_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
));
#endif
#if VEC_SET == AVX_512
_SIMD_MASK_STOREU
(
s_dbl
,
0x01
+
0x02
,
tmp2
);
h2_real
=
_SIMD_SET1
(
s_dbl
[
0
]);
h2_imag
=
_SIMD_SET1
(
s_dbl
[
1
]);
#endif
/* VEC_SET == AVX_512 */
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
h2_real
=
_mm_movedup_pd
(
tmp2
);
...
...
@@ -1894,6 +2134,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real
=
_SIMD_SET1
(
hh_dbl
[(
ldh
+
1
)
*
2
]);
h2_imag
=
_SIMD_SET1
(
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_512 */
q1
=
_SIMD_LOAD
(
&
q_dbl
[(
ldq
*
2
)
+
0
]);
q2
=
_SIMD_LOAD
(
&
q_dbl
[(
ldq
*
2
)
+
offset
]);
q3
=
_SIMD_LOAD
(
&
q_dbl
[(
ldq
*
2
)
+
2
*
offset
]);
...
...
@@ -2044,6 +2289,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[((
ldh
+
i
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real
=
_SIMD_SET1
(
hh_dbl
[(
ldh
+
i
)
*
2
]);
h2_imag
=
_SIMD_SET1
(
hh_dbl
[((
ldh
+
i
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_512 */
tmp1
=
_SIMD_MUL
(
h2_imag
,
y1
);
#ifdef __ELPA_USE_FMA__
q1
=
_SIMD_ADD
(
q1
,
_SIMD_FMADDSUB
(
h2_real
,
y1
,
_SIMD_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
)));
...
...
@@ -2111,6 +2361,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h1_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[((
nb
-
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h1_real
=
_SIMD_SET1
(
hh_dbl
[(
nb
-
1
)
*
2
]);
h1_imag
=
_SIMD_SET1
(
hh_dbl
[((
nb
-
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_512 */
q1
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
0
]);
q2
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
offset
]);
q3
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
2
*
offset
]);
...
...
@@ -2240,7 +2495,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
__SIMD_DATATYPE
sign
=
(
__SIMD_DATATYPE
)
_mm512_set_epi64
(
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
);
__SIMD_DATATYPE
sign
=
(
__SIMD_DATATYPE
)
_mm512_set
1
_epi64
(
0x8000000000000000
);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__SIMD_DATATYPE
sign
=
(
__SIMD_DATATYPE
)
_mm512_set1_epi32
(
0x80000000
);
...
...
@@ -2270,6 +2525,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real
=
_SIMD_SET1
(
hh_dbl
[(
ldh
+
1
)
*
2
]);
h2_imag
=
_SIMD_SET1
(
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag
=
_SIMD_XOR
(
h2_imag
,
sign
);
...
...
@@ -2410,6 +2670,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[((
ldh
+
i
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real
=
_SIMD_SET1
(
hh_dbl
[(
ldh
+
i
)
*
2
]);
h2_imag
=
_SIMD_SET1
(
hh_dbl
[((
ldh
+
i
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag
=
_SIMD_XOR
(
h2_imag
,
sign
);
...
...
@@ -2470,6 +2735,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h1_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[((
nb
-
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h1_real
=
_SIMD_SET1
(
hh_dbl
[(
nb
-
1
)
*
2
]);
h1_imag
=
_SIMD_SET1
(
hh_dbl
[((
nb
-
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag
=
_SIMD_XOR
(
h1_imag
,
sign
);
...
...
@@ -2621,10 +2891,49 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[(
ldh
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h1_real
=
_SIMD_SET1
(
hh_dbl
[
ldh
*
2
]);
h1_imag
=
_SIMD_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
h2_real
=
_SIMD_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_SIMD_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
h2_real
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h2_real
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__SIMD_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
/* HAVE_AVX512_XEON_PHI */
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_SIMD_XOR
(
h1_real
,
sign
);
h1_imag
=
_SIMD_XOR
(
h1_imag
,
sign
);
h2_real
=
_SIMD_XOR
(
h2_real
,
sign
);
h2_imag
=
_SIMD_XOR
(
h2_imag
,
sign
);
#endif
#endif
#endif
/* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
h1_real
=
_SIMD_XOR
(
h1_real
,
sign
);
h1_imag
=
_SIMD_XOR
(
h1_imag
,
sign
);
h2_real
=
_SIMD_XOR
(
h2_real
,
sign
);
h2_imag
=
_SIMD_XOR
(
h2_imag
,
sign
);
#endif
/* VEC_SET != AVX_512 */
#if VEC_SET == SSE_128
#ifdef SINGLE_PRECISION_COMPLEX
...
...
@@ -2644,6 +2953,18 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2
=
_SIMD_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
s_dbl
[
1
],
s_dbl
[
0
],
s_dbl
[
1
],
s_dbl
[
0
],
s_dbl
[
1
],
s_dbl
[
0
]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
tmp2
=
(
__SIMD_DATATYPE
)
_mm512_set1_pd
(
*
(
double
*
)(
&
s_dbl
[
0
]));
#endif
#endif
/* VEC_SET == AVX_512 */
tmp1
=
_SIMD_MUL
(
h2_imag
,
tmp2
);
#ifdef __ELPA_USE_FMA__
tmp2
=
_SIMD_FMADDSUB
(
h2_real
,
tmp2
,
_SIMD_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
));
...
...
@@ -2651,6 +2972,13 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp2
=
_SIMD_ADDSUB
(
_SIMD_MUL
(
h2_real
,
tmp2
),
_SIMD_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
));
#endif
#if VEC_SET == AVX_512
_SIMD_MASK_STOREU
(
s_dbl
,
0x01
+
0x02
,
tmp2
);
h2_real
=
_SIMD_SET1
(
s_dbl
[
0
]);
h2_imag
=
_SIMD_SET1
(
s_dbl
[
1
]);
#endif
/* VEC_SET == AVX_512 */
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
h2_real
=
_mm_movedup_pd
(
tmp2
);
...
...
@@ -2666,7 +2994,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_real
=
_SIMD_SET1
(
tmp2
[
0
]);
h2_imag
=
_SIMD_SET1
(
tmp2
[
1
]);
#endif
/* VEC_SET == AVX_256 */
tmp1
=
_SIMD_MUL
(
h1_imag
,
y1
);
#ifdef __ELPA_USE_FMA__
y1
=
_SIMD_FMADDSUB
(
h1_real
,
y1
,
_SIMD_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
));
...
...
@@ -2782,6 +3109,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real
=
_SIMD_SET1
(
hh_dbl
[(
ldh
+
1
)
*
2
]);
h2_imag
=
_SIMD_SET1
(
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_512 */
q1
=
_SIMD_LOAD
(
&
q_dbl
[(
ldq
*
2
)
+
0
]);