Commit 39fbe8ba authored by Andreas Marek's avatar Andreas Marek
Browse files

Unify complex avx512 block2 kernel

parent 0d1da4da
......@@ -932,7 +932,6 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_simple.F90 \
@top_srcdir@/src/elpa2/kernels/complex.F90 \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \
......
......@@ -791,7 +791,6 @@ EXTRA_DIST = \
src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \
src/elpa2/elpa2_tridiag_band_template.F90 \
src/elpa2/kernels/complex_avx512_2hv_template.c \
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/real_vsx_4hv_template.c \
......
......@@ -702,9 +702,9 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
print("# " + cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" + a + "-" + b + "-" +g + "-" + cov + "-" + instr + "-" + addr)
print(cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" +a + "-" +b + "-" +g + "-" + cov + "-" + instr + "-" + addr + "-jobs:")
if (MasterOnly):
print(" only:")
print(" - /.*master.*/")
#if (MasterOnly):
# print(" only:")
# print(" - /.*master.*/")
if (instr == "power8"):
print(" allow_failure: true")
print(" tags:")
......
......@@ -247,9 +247,11 @@
#endif
#define _SIMD_BROADCAST 1
#define _SIMD_SET1 _mm512_set1_pd
#define _SIMD_SET _mm512_set_pd
#define _SIMD_XOR_EPI _mm512_xor_epi64
#define _SIMD_ADDSUB 1
#define _SIMD_SHUFFLE _mm512_shuffle_pd
#define _SIMD_MASK_STOREU _mm512_mask_storeu_pd
#define _SHUFFLE 0x55
#ifdef HAVE_AVX512
......@@ -277,8 +279,10 @@
#endif
#define _SIMD_BROADCAST 1
#define _SIMD_SET1 _mm512_set1_ps
#define _SIMD_SET _mm512_set_ps
#define _SIMD_ADDSUB 1
#define _SIMD_SHUFFLE _mm512_shuffle_ps
#define _SIMD_MASK_STOREU _mm512_mask_storeu_ps
#define _SIMD_XOR_EPI _mm512_xor_epi32
#define _SHUFFLE 0xb1
......@@ -723,6 +727,37 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine double_hh_trafo_complex_AVX512_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_AVX512_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine double_hh_trafo_complex_AVX512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_AVX512_2hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq
#ifdef BLOCK1
)
......@@ -1127,11 +1162,50 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 16
#define STEP_SIZE 16
#define UPPER_BOUND 12
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 32
#define STEP_SIZE 32
#define UPPER_BOUND 24
#endif
#endif /* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
for (i = 0; i < nq - UPPER_BOUND; i+=STEP_SIZE)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on +=ROW_LENGTH;
}
#endif
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
for (i = 0; i < nq - 12; i+=16)
{
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+4], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+8], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+12], hh, nb, ldq, ldh, s);
worked_on +=16;
}
#endif
#ifdef SINGLE_PRECISION_COMPLEX
for (i = 0; i < nq - 24; i+=32)
{
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+8], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+16], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+24], hh, nb, ldq, ldh, s);
worked_on +=32;
}
#endif
#endif
if (nq == i)
{
......@@ -1158,11 +1232,44 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 24
#endif
#endif /* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
if (nq-i == ROW_LENGTH)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on += ROW_LENGTH;
}
#endif
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
if (nq-i == 12)
{
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+4], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+8], hh, nb, ldq, ldh, s);
worked_on +=12;
}
#endif
#ifdef SINGLE_PRECISION_COMPLEX
if (nq-i == 24)
{
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+8], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+16], hh, nb, ldq, ldh, s);
worked_on +=24;
}
#endif
#endif
#if VEC_SET == SSE_128
#undef ROW_LENGTH
......@@ -1184,12 +1291,42 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 16
#endif
#endif /* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
if (nq-i == ROW_LENGTH)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on += ROW_LENGTH;
}
#endif
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
if (nq-i == 8)
{
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i+4], hh, nb, ldq, ldh, s);
worked_on +=8;
}
#endif
#ifdef SINGLE_PRECISION_COMPLEX
if (nq-i == 16)
{
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i+8], hh, nb, ldq, ldh, s);
worked_on +=16;
}
#endif
#endif
#if VEC_SET == SSE_128
#undef ROW_LENGTH
......@@ -1211,20 +1348,50 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
#endif /* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
if (nq-i == ROW_LENGTH)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on += ROW_LENGTH;
}
#endif
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
if (nq-i == 4)
{
hh_trafo_complex_kernel_4_AVX512_2hv_double (&q[i], hh, nb, ldq, ldh, s);
worked_on +=4;
}
#endif
#ifdef SINGLE_PRECISION_COMPLEX
if (nq-i == 8)
{
hh_trafo_complex_kernel_8_AVX512_2hv_single (&q[i], hh, nb, ldq, ldh, s);
worked_on +=8;
}
#endif
#endif
#endif /* BLOCK2 */
#ifdef WITH_DEBUG
//#ifdef WITH_DEBUG
if (worked_on != nq)
{
printf("Error in complex SIMD_SET BLOCK BLOCK kernel %d %d\n", worked_on, nq);
abort();
}
#endif
//#endif
}
......@@ -1300,7 +1467,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
__SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
__SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
......@@ -1331,6 +1498,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
#endif /* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SIMD_XOR(h2_imag, sign);
......@@ -1485,6 +1657,12 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
#endif /* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SIMD_XOR(h2_imag, sign);
......@@ -1551,6 +1729,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
#endif /* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _SIMD_XOR(h1_imag, sign);
......@@ -1643,7 +1826,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif /* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
h1_real = _SIMD_XOR(h1_real, sign);
h1_imag = _SIMD_XOR(h1_imag, sign);
......@@ -1716,10 +1898,49 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif /* HAVE_AVX512_XEON_PHI */
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _SIMD_XOR(h1_real, sign);
h1_imag = _SIMD_XOR(h1_imag, sign);
h2_real = _SIMD_XOR(h2_real, sign);
h2_imag = _SIMD_XOR(h2_imag, sign);
#endif
#endif
#endif /* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
h1_real = _SIMD_XOR(h1_real, sign);
h1_imag = _SIMD_XOR(h1_imag, sign);
h2_real = _SIMD_XOR(h2_real, sign);
h2_imag = _SIMD_XOR(h2_imag, sign);
#endif /* VEC_SET != AVX_512 */
#if VEC_SET == SSE_128
#ifdef SINGLE_PRECISION_COMPLEX
......@@ -1739,6 +1960,18 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
s_dbl[1], s_dbl[0],
s_dbl[1], s_dbl[0],
s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
#endif
#endif /* VEC_SET == AVX_512 */
tmp1 = _SIMD_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
......@@ -1746,6 +1979,13 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
#if VEC_SET == AVX_512
_SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
h2_real = _SIMD_SET1(s_dbl[0]);
h2_imag = _SIMD_SET1(s_dbl[1]);
#endif /* VEC_SET == AVX_512 */
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
h2_real = _mm_movedup_pd(tmp2);
......@@ -1894,6 +2134,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
#endif /* VEC_SET == AVX_512 */
q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]);
q3 = _SIMD_LOAD(&q_dbl[(ldq*2)+2*offset]);
......@@ -2044,6 +2289,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
#endif /* VEC_SET == AVX_512 */
tmp1 = _SIMD_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
......@@ -2111,6 +2361,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
#endif /* VEC_SET == AVX_512 */
q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
......@@ -2240,7 +2495,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
__SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
__SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
......@@ -2270,6 +2525,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
#endif /* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SIMD_XOR(h2_imag, sign);
......@@ -2410,6 +2670,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
#endif /* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SIMD_XOR(h2_imag, sign);
......@@ -2470,6 +2735,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
#endif /* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _SIMD_XOR(h1_imag, sign);
......@@ -2621,10 +2891,49 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif /* HAVE_AVX512_XEON_PHI */
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _SIMD_XOR(h1_real, sign);
h1_imag = _SIMD_XOR(h1_imag, sign);
h2_real = _SIMD_XOR(h2_real, sign);
h2_imag = _SIMD_XOR(h2_imag, sign);
#endif
#endif
#endif /* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
h1_real = _SIMD_XOR(h1_real, sign);
h1_imag = _SIMD_XOR(h1_imag, sign);
h2_real = _SIMD_XOR(h2_real, sign);
h2_imag = _SIMD_XOR(h2_imag, sign);
#endif /* VEC_SET != AVX_512 */
#if VEC_SET == SSE_128
#ifdef SINGLE_PRECISION_COMPLEX
......@@ -2644,6 +2953,18 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
s_dbl[1], s_dbl[0],
s_dbl[1], s_dbl[0],
s_dbl[1], s_dbl[0]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
#endif
#endif /* VEC_SET == AVX_512 */
tmp1 = _SIMD_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
......@@ -2651,6 +2972,13 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
#if VEC_SET == AVX_512
_SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
h2_real = _SIMD_SET1(s_dbl[0]);
h2_imag = _SIMD_SET1(s_dbl[1]);
#endif /* VEC_SET == AVX_512 */
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
h2_real = _mm_movedup_pd(tmp2);
......@@ -2666,7 +2994,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_real = _SIMD_SET1(tmp2[0]);
h2_imag = _SIMD_SET1(tmp2[1]);
#endif /* VEC_SET == AVX_256 */
tmp1 = _SIMD_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
......@@ -2782,6 +3109,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
#endif /* VEC_SET == AVX_512 */