Commit 4b618c06 authored by Andreas Marek's avatar Andreas Marek
Browse files

Unify complex AVX block1 kernel

parent 53eb97d4
......@@ -893,8 +893,6 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \
@top_srcdir@/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \
@top_srcdir@/src/elpa2/kernels/simple_template.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_template.F90 \
@top_srcdir@/src/elpa2/kernels/complex_sse_1hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c \
......@@ -937,7 +935,6 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_simple.F90 \
@top_srcdir@/src/elpa2/kernels/complex.F90 \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_double_precision.c \
......
......@@ -791,8 +791,6 @@ EXTRA_DIST = \
src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \
src/elpa2/elpa2_tridiag_band_template.F90 \
src/elpa2/kernels/complex_avx-avx2_1hv_template.c \
src/elpa2/kernels/complex_avx-avx2_2hv_template.c \
src/elpa2/kernels/complex_avx512_1hv_template.c \
src/elpa2/kernels/complex_avx512_2hv_template.c \
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
......
......@@ -116,11 +116,15 @@
#define _SIMD_MUL _mm_mul_pd
#define _SIMD_ADD _mm_add_pd
#define _SIMD_XOR _mm_xor_pd
#define _SIMD_MADDSUB _mm_maddsub_pd
#define _SIMD_ADDSUB _mm_addsub_pd
#define _SIMD_SHUFFLE _mm_shuffle_pd
#define _SHUFFLE _MM_SHUFFLE2(0,1)
#ifdef __ELPA_USE_FMA__
#define _SIMD_FMSUBADD _mm_maddsub_pd
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 4
#define __SIMD_DATATYPE __m128
......@@ -131,12 +135,16 @@
#define _SIMD_MUL _mm_mul_ps
#define _SIMD_ADD _mm_add_ps
#define _SIMD_XOR _mm_xor_ps
#define _SIMD_MADDSUB _mm_maddsub_ps
#define _SIMD_ADDSUB _mm_addsub_ps
#define _SIMD_SHUFFLE _mm_shuffle_ps
#define _SHUFFLE 0xb1
#ifdef __ELPA_USE_FMA__
#define _SIMD_FMSUBADD _mm_maddsub_ps
#endif
#endif /* SINGLE_PRECISION_COMPLEX */
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
......@@ -152,7 +160,7 @@
#define _SIMD_ADD _mm256_add_pd
#define _SIMD_XOR _mm256_xor_pd
#define _SIMD_BROADCAST _mm256_broadcast_sd
#define _SIMD_MADDSUB 1
#define _SIMD_SET1 _mm256_set1_pd
#define _SIMD_ADDSUB _mm256_addsub_pd
#define _SIMD_SHUFFLE _mm256_shuffle_pd
#define _SHUFFLE 0x5
......@@ -188,9 +196,9 @@
#define _SIMD_ADD _mm256_add_ps
#define _SIMD_XOR _mm256_xor_ps
#define _SIMD_BROADCAST _mm256_broadcast_ss
#define _SIMD_MADDSUB 1
#define _SIMD_SET1 _mm256_set1_ps
#define _SIMD_ADDSUB _mm256_addsub_ps
#define _SIMD_SHUFFLE _mm_shuffle_ps
#define _SIMD_SHUFFLE _mm256_shuffle_ps
#define _SHUFFLE 0xb1
#ifdef HAVE_AVX2
......@@ -462,6 +470,36 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
*/
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine single_hh_trafo_complex_AVX_AVX2_1hv_double(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_AVX_AVX2_1hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine single_hh_trafo_complex_AVX_AVX2_1hv_single(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_AVX_AVX2_1hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
......@@ -492,6 +530,35 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine double_hh_trafo_complex_AVX_AVX2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_AVX_AVX2_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine double_hh_trafo_complex_AVX_AVX2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_AVX_AVX2_2hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq
#ifdef BLOCK1
......@@ -692,17 +759,33 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#ifdef BLOCK2
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#define STEP_SIZE 4
#define UPPER_BOUND 2
#define UPPER_BOUND 3
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#define STEP_SIZE 8
#define UPPER_BOUND 4
#define UPPER_BOUND 6
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#define STEP_SIZE 8
#define UPPER_BOUND 6
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 16
#define STEP_SIZE 16
#define UPPER_BOUND 12
#endif
#endif /* VEC_SET == AVX_256 */
for (i = 0; i < nq - UPPER_BOUND; i+=STEP_SIZE)
{
......@@ -714,6 +797,34 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
{
return;
}
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 3
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 6
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 6
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#endif
#endif /* VEC_SET == AVX_256 */
if (nq-i == ROW_LENGTH)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on += ROW_LENGTH;
}
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
......@@ -721,12 +832,25 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
#endif /* VEC_SET == AVX_256 */
if (nq-i == ROW_LENGTH)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on += ROW_LENGTH;
}
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 1
......@@ -734,6 +858,18 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
#endif /* VEC_SET == AVX_256 */
if (nq-i == ROW_LENGTH)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
......@@ -804,6 +940,15 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
__SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif
#endif /* VEC_SET == AVX_256 */
#ifdef BLOCK2
x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]);
......@@ -812,7 +957,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
x5 = _SIMD_LOAD(&q_dbl[(2*ldq)+4*offset]);
x6 = _SIMD_LOAD(&q_dbl[(2*ldq)+5*offset]);
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
......@@ -824,6 +968,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SIMD_XOR(h2_imag, sign);
......@@ -838,39 +987,39 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _SIMD_ADD(y1, _mm_msubadd_pd(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
y2 = _SIMD_ADD(y2, _mm_msubadd_pd(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
y3 = _SIMD_ADD(y3, _mm_msubadd_pd(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
y4 = _SIMD_ADD(y4, _mm_msubadd_pd(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h2_imag, x5);
#ifdef __ELPA_USE_FMA__
y5 = _SIMD_ADD(y5, _mm_msubadd_pd(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
tmp6 = _SIMD_MUL(h2_imag, x6);
#ifdef __ELPA_USE_FMA__
y6 = _SIMD_ADD(y6, _mm_msubadd_pd(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
y6 = _SIMD_ADD(y6, _SIMD_FMSUBADD(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
......@@ -900,6 +1049,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _SIMD_XOR(h1_imag, sign);
......@@ -913,40 +1067,39 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
q6 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
tmp1 = _SIMD_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
x1 = _SIMD_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
x2 = _SIMD_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
x3 = _SIMD_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
x4 = _SIMD_ADD(x4, _mm_msubadd_pd(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h1_imag, q5);
#ifdef __ELPA_USE_FMA__
x5 = _SIMD_ADD(x5, _mm_msubadd_pd(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
tmp6 = _SIMD_MUL(h1_imag, q6);
#ifdef __ELPA_USE_FMA__
x6 = _SIMD_ADD(x6, _mm_msubadd_pd(h1_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
x6 = _SIMD_ADD(x6, _SIMD_FMSUBADD(h1_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
x6 = _SIMD_ADD(x6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
......@@ -964,6 +1117,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _SIMD_XOR(h2_imag, sign);
......@@ -971,39 +1129,39 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
y1 = _SIMD_ADD(y1, _mm_msubadd_pd(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
y2 = _SIMD_ADD(y2, _mm_msubadd_pd(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
y3 = _SIMD_ADD(y3, _mm_msubadd_pd(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h2_imag, q4);
#ifdef __ELPA_USE_FMA__
y4 = _SIMD_ADD(y4, _mm_msubadd_pd(h2_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h2_imag, q5);
#ifdef __ELPA_USE_FMA__
y5 = _SIMD_ADD(y5, _mm_msubadd_pd(h2_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
tmp6 = _SIMD_MUL(h2_imag, q6);
#ifdef __ELPA_USE_FMA__
y6 = _SIMD_ADD(y6, _mm_msubadd_pd(h2_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
y6 = _SIMD_ADD(y6, _SIMD_FMSUBADD(h2_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
......@@ -1014,6 +1172,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#ifdef BLOCK2
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
......@@ -1022,6 +1181,12 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
#endif /* VEC_SET == AVX_256 */
#ifndef __ELPA_USE_FMA__
// conjugate
......@@ -1037,39 +1202,39 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
x1 = _SIMD_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
x2 = _SIMD_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
x3 = _SIMD_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
x4 = _SIMD_ADD(x4, _mm_msubadd_pd(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h1_imag, q5);
#ifdef __ELPA_USE_FMA__
x5 = _SIMD_ADD(x5, _mm_msubadd_pd(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
tmp6 = _SIMD_MUL(h1_imag, q6);
#ifdef __ELPA_USE_FMA__
x6 = _SIMD_ADD(x6, _mm_msubadd_pd(h1_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
x6 = _SIMD_ADD(x6, _SIMD_FMSUBADD(h1_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
x6 = _SIMD_ADD(x6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
......@@ -1087,50 +1252,55 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
h1_real = _SIMD_BROADCAST(&hh_dbl[0]);
h1_imag = _SIMD_BROADCAST(&hh_dbl[1]);
#endif /* VEC_SET == AVX_256 */
h1_real = _SIMD_XOR(h1_real, sign);
h1_imag = _SIMD_XOR(h1_imag, sign);
tmp1 = _SIMD_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
x1 = _SIMD_MADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
tmp2 = _SIMD_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
x2 = _SIMD_MADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
tmp3 = _SIMD_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
x3 = _SIMD_MADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
x3 = _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
x3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
tmp4 = _SIMD_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
x4 = _SIMD_MADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
x4 = _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
x4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif
tmp5 = _SIMD_MUL(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
x5 = _SIMD_MADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
x5 = _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
#else
x5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
#endif
tmp6 = _SIMD_MUL(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
x6 = _SIMD_MADDSUB(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
x6 = _SIMD_FMADDSUB(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
#else
x6 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
#endif
#ifdef BLOCK2
#if VEC_SET == SSE_128