From 7de3d8d382f96de9abbd972e7fd57d589ea3d86c Mon Sep 17 00:00:00 2001 From: Andreas Marek <andreas.marek@mpcdf.mpg.de> Date: Tue, 11 Jun 2019 19:42:56 +0200 Subject: [PATCH] Fix all sign errors --- ...plex_128bit_256bit_512bit_BLOCK_template.c | 463 ++++-------------- 1 file changed, 107 insertions(+), 356 deletions(-) diff --git a/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c b/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c index ba791b78..aeddab6a 100644 --- a/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c +++ b/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c @@ -788,37 +788,16 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D #endif #endif /* VEC_SET == AVX_256 */ -#if VEC_SET == SSE_128 for (i = 0; i < nq - UPPER_BOUND; i+=STEP_SIZE) { CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); worked_on +=ROW_LENGTH; } -#endif -#if VEC_SET == AVX_256 -#ifdef DOUBLE_PRECISION_COMPLEX -#define ROW_LENGTH 2 -#define STEP_SIZE 4 -#define UPPER_BOUND 2 -#endif -#ifdef SINGLE_PRECISION_COMPLEX -#define ROW_LENGTH 4 -#define STEP_SIZE 8 -#define UPPER_BOUND 4 -#endif - for (i = 0; i < nq - UPPER_BOUND; i+=STEP_SIZE) - { - CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); - CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i+ROW_LENGTH], hh, nb, ldq, ldh, s); - worked_on +=STEP_SIZE; - } -#endif if (nq == i) { return; } -#if VEC_SET == SSE_128 #if VEC_SET == SSE_128 #undef ROW_LENGTH @@ -873,9 +852,6 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D } -#endif - - #if VEC_SET == SSE_128 #undef ROW_LENGTH #ifdef DOUBLE_PRECISION_COMPLEX @@ -903,13 +879,13 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D } #endif /* BLOCK2 */ -//#ifdef WITH_DEBUG +#ifdef WITH_DEBUG if (worked_on != nq) { printf("Error in complex SIMD_SET BLOCK BLOCK kernel %d %d\n", worked_on, nq); abort(); } -//#endif +#endif } @@ -1378,7 +1354,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, tmp2); #ifdef __ELPA_USE_FMA__ - tmp2 = _SIMD_FMSUBADD(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); + tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #endif @@ -1401,78 +1377,78 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h1_imag, y1); #ifdef __ELPA_USE_FMA__ - y1 = _SIMD_FMSUBADD(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); + y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #endif tmp2 = _SIMD_MUL(h1_imag, y2); #ifdef __ELPA_USE_FMA__ - y2 = _SIMD_FMSUBADD(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); + y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); #else y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); #endif tmp3 = _SIMD_MUL(h1_imag, y3); #ifdef __ELPA_USE_FMA__ - y3 = _SIMD_FMSUBADD(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); + y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); #else y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); #endif tmp4 = _SIMD_MUL(h1_imag, y4); #ifdef __ELPA_USE_FMA__ - y4 = _SIMD_FMSUBADD(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); + y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); #else y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); #endif tmp5 = _SIMD_MUL(h1_imag, y5); #ifdef __ELPA_USE_FMA__ - y5 = _SIMD_FMSUBADD(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); + y5 = _SIMD_FMADDSUB(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); #else y5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); #endif tmp6 = _SIMD_MUL(h1_imag, y6); #ifdef __ELPA_USE_FMA__ - y6 = _SIMD_FMSUBADD(h1_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)); + y6 = _SIMD_FMADDSUB(h1_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)); #else y6 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)); #endif tmp1 = _SIMD_MUL(h2_imag, x1); #ifdef __ELPA_USE_FMA__ - y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, x2); #ifdef __ELPA_USE_FMA__ - y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h2_imag, x3); #ifdef __ELPA_USE_FMA__ - y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif tmp4 = _SIMD_MUL(h2_imag, x4); #ifdef __ELPA_USE_FMA__ - y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); + y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #else y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #endif tmp5 = _SIMD_MUL(h2_imag, x5); #ifdef __ELPA_USE_FMA__ - y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); + y5 = _SIMD_ADD(y5, _SIMD_FMADDSUB(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #else y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #endif tmp6 = _SIMD_MUL(h2_imag, x6); #ifdef __ELPA_USE_FMA__ - y6 = _SIMD_ADD(y6, _SIMD_FMSUBADD(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); + y6 = _SIMD_ADD(y6, _SIMD_FMADDSUB(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); #else y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); #endif @@ -1547,39 +1523,39 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, y1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, y2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h2_imag, y3); #ifdef __ELPA_USE_FMA__ - q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif tmp4 = _SIMD_MUL(h2_imag, y4); #ifdef __ELPA_USE_FMA__ - q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #else q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #endif tmp5 = _SIMD_MUL(h2_imag, y5); #ifdef __ELPA_USE_FMA__ - q5 = _SIMD_ADD(q5, _SIMD_FMSUBADD(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #else q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #endif tmp6 = _SIMD_MUL(h2_imag, y6); #ifdef __ELPA_USE_FMA__ - q6 = _SIMD_ADD(q6, _SIMD_FMSUBADD(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); + q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); #else q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); #endif @@ -1678,39 +1654,39 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, y1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, y2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h2_imag, y3); #ifdef __ELPA_USE_FMA__ - q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif tmp4 = _SIMD_MUL(h2_imag, y4); #ifdef __ELPA_USE_FMA__ - q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #else q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #endif tmp5 = _SIMD_MUL(h2_imag, y5); #ifdef __ELPA_USE_FMA__ - q5 = _SIMD_ADD(q5, _SIMD_FMSUBADD(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #else q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #endif tmp6 = _SIMD_MUL(h2_imag, y6); #ifdef __ELPA_USE_FMA__ - q6 = _SIMD_ADD(q6, _SIMD_FMSUBADD(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); + q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); #else q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); #endif @@ -1752,39 +1728,39 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h1_imag, x1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h1_imag, x2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h1_imag, x3); #ifdef __ELPA_USE_FMA__ - q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif tmp4 = _SIMD_MUL(h1_imag, x4); #ifdef __ELPA_USE_FMA__ - q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #else q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #endif tmp5 = _SIMD_MUL(h1_imag, x5); #ifdef __ELPA_USE_FMA__ - q5 = _SIMD_ADD(q5, _SIMD_FMSUBADD(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #else q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #endif tmp6 = _SIMD_MUL(h1_imag, x6); #ifdef __ELPA_USE_FMA__ - q6 = _SIMD_ADD(q6, _SIMD_FMSUBADD(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); + q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); #else q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); #endif @@ -2232,7 +2208,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, tmp2); #ifdef __ELPA_USE_FMA__ - tmp2 = _SIMD_FMSUBADD(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); + tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #endif @@ -2255,66 +2231,66 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h1_imag, y1); #ifdef __ELPA_USE_FMA__ - y1 = _SIMD_FMSUBADD(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); + y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #endif tmp2 = _SIMD_MUL(h1_imag, y2); #ifdef __ELPA_USE_FMA__ - y2 = _SIMD_FMSUBADD(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); + y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); #else y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); #endif tmp3 = _SIMD_MUL(h1_imag, y3); #ifdef __ELPA_USE_FMA__ - y3 = _SIMD_FMSUBADD(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); + y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); #else y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); #endif tmp4 = _SIMD_MUL(h1_imag, y4); #ifdef __ELPA_USE_FMA__ - y4 = _SIMD_FMSUBADD(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); + y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); #else y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); #endif tmp5 = _SIMD_MUL(h1_imag, y5); #ifdef __ELPA_USE_FMA__ - y5 = _SIMD_FMSUBADD(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); + y5 = _SIMD_FMADDSUB(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); #else y5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); #endif tmp1 = _SIMD_MUL(h2_imag, x1); #ifdef __ELPA_USE_FMA__ - y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, x2); #ifdef __ELPA_USE_FMA__ - y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h2_imag, x3); #ifdef __ELPA_USE_FMA__ - y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif tmp4 = _SIMD_MUL(h2_imag, x4); #ifdef __ELPA_USE_FMA__ - y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); + y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #else y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #endif tmp5 = _SIMD_MUL(h2_imag, x5); #ifdef __ELPA_USE_FMA__ - y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); + y5 = _SIMD_ADD(y5, _SIMD_FMADDSUB(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #else y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #endif @@ -2383,33 +2359,33 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, y1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, y2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h2_imag, y3); #ifdef __ELPA_USE_FMA__ - q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif tmp4 = _SIMD_MUL(h2_imag, y4); #ifdef __ELPA_USE_FMA__ - q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #else q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #endif tmp5 = _SIMD_MUL(h2_imag, y5); #ifdef __ELPA_USE_FMA__ - q5 = _SIMD_ADD(q5, _SIMD_FMSUBADD(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #else q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #endif @@ -2500,33 +2476,33 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, y1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, y2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h2_imag, y3); #ifdef __ELPA_USE_FMA__ - q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif tmp4 = _SIMD_MUL(h2_imag, y4); #ifdef __ELPA_USE_FMA__ - q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #else q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #endif tmp5 = _SIMD_MUL(h2_imag, y5); #ifdef __ELPA_USE_FMA__ - q5 = _SIMD_ADD(q5, _SIMD_FMSUBADD(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #else q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #endif @@ -2566,33 +2542,33 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h1_imag, x1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h1_imag, x2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h1_imag, x3); #ifdef __ELPA_USE_FMA__ - q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif tmp4 = _SIMD_MUL(h1_imag, x4); #ifdef __ELPA_USE_FMA__ - q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #else q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #endif tmp5 = _SIMD_MUL(h1_imag, x5); #ifdef __ELPA_USE_FMA__ - q5 = _SIMD_ADD(q5, _SIMD_FMSUBADD(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #else q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); #endif @@ -3003,7 +2979,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, tmp2); #ifdef __ELPA_USE_FMA__ - tmp2 = _SIMD_FMSUBADD(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); + tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #endif @@ -3026,56 +3002,56 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h1_imag, y1); #ifdef __ELPA_USE_FMA__ - y1 = _SIMD_FMSUBADD(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); + y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #endif tmp2 = _SIMD_MUL(h1_imag, y2); #ifdef __ELPA_USE_FMA__ - y2 = _SIMD_FMSUBADD(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); + y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); #else y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); #endif tmp3 = _SIMD_MUL(h1_imag, y3); #ifdef __ELPA_USE_FMA__ - y3 = _SIMD_FMSUBADD(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); + y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); #else y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); #endif tmp4 = _SIMD_MUL(h1_imag, y4); #ifdef __ELPA_USE_FMA__ - y4 = _SIMD_FMSUBADD(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); + y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); #else y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); #endif tmp1 = _SIMD_MUL(h2_imag, x1); #ifdef __ELPA_USE_FMA__ - y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, x2); #ifdef __ELPA_USE_FMA__ - y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h2_imag, x3); #ifdef __ELPA_USE_FMA__ - y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif tmp4 = _SIMD_MUL(h2_imag, x4); #ifdef __ELPA_USE_FMA__ - y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); + y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #else y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #endif @@ -3135,30 +3111,29 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM q4 = _SIMD_ADD(q4, x4); tmp1 = _SIMD_MUL(h2_imag, y1); - #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, y2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h2_imag, y3); #ifdef __ELPA_USE_FMA__ - q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif tmp4 = _SIMD_MUL(h2_imag, y4); #ifdef __ELPA_USE_FMA__ - q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #else q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #endif @@ -3241,26 +3216,26 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, y1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, y2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h2_imag, y3); #ifdef __ELPA_USE_FMA__ - q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif tmp4 = _SIMD_MUL(h2_imag, y4); #ifdef __ELPA_USE_FMA__ - q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #else q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #endif @@ -3298,26 +3273,26 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h1_imag, x1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h1_imag, x2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h1_imag, x3); #ifdef __ELPA_USE_FMA__ - q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif tmp4 = _SIMD_MUL(h1_imag, x4); #ifdef __ELPA_USE_FMA__ - q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #else q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); #endif @@ -3688,7 +3663,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, tmp2); #ifdef __ELPA_USE_FMA__ - tmp2 = _SIMD_FMSUBADD(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); + tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #endif @@ -3711,42 +3686,42 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h1_imag, y1); #ifdef __ELPA_USE_FMA__ - y1 = _SIMD_FMSUBADD(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); + y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #endif tmp2 = _SIMD_MUL(h1_imag, y2); #ifdef __ELPA_USE_FMA__ - y2 = _SIMD_FMSUBADD(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); + y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); #else y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); #endif tmp3 = _SIMD_MUL(h1_imag, y3); #ifdef __ELPA_USE_FMA__ - y3 = _SIMD_FMSUBADD(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); + y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); #else y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); #endif tmp1 = _SIMD_MUL(h2_imag, x1); #ifdef __ELPA_USE_FMA__ - y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, x2); #ifdef __ELPA_USE_FMA__ - y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h2_imag, x3); #ifdef __ELPA_USE_FMA__ - y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif @@ -3802,21 +3777,21 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, y1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, y2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h2_imag, y3); #ifdef __ELPA_USE_FMA__ - q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif @@ -3890,20 +3865,20 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, y1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, y2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h2_imag, y3); #ifdef __ELPA_USE_FMA__ - q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif @@ -3939,20 +3914,20 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h1_imag, x1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h1_imag, x2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif tmp3 = _SIMD_MUL(h1_imag, x3); #ifdef __ELPA_USE_FMA__ - q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #else q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); #endif @@ -4214,7 +4189,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_XOR(h1_imag, sign); tmp1 = _SIMD_MUL(h1_imag, x1); - #ifdef __ELPA_USE_FMA__ x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else @@ -4282,7 +4256,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, tmp2); #ifdef __ELPA_USE_FMA__ - tmp2 = _SIMD_FMSUBADD(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); + tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #endif @@ -4305,26 +4279,26 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h1_imag, y1); #ifdef __ELPA_USE_FMA__ - y1 = _SIMD_FMSUBADD(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); + y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #endif tmp2 = _SIMD_MUL(h1_imag, y2); #ifdef __ELPA_USE_FMA__ - y2 = _SIMD_FMSUBADD(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); + y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); #else y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); #endif tmp1 = _SIMD_MUL(h2_imag, x1); #ifdef __ELPA_USE_FMA__ - y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, x2); #ifdef __ELPA_USE_FMA__ - y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif @@ -4373,13 +4347,13 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, y1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, y2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif @@ -4445,13 +4419,13 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h2_imag, y1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h2_imag, y2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif @@ -4484,13 +4458,13 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM tmp1 = _SIMD_MUL(h1_imag, x1); #ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #else q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); #endif tmp2 = _SIMD_MUL(h1_imag, x2); #ifdef __ELPA_USE_FMA__ - q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #else q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); #endif @@ -4502,10 +4476,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM } -//if VEC_SET == SSE_128 || (VEC_SET == AVX_256 && BLOCK == 1) - - - #if VEC_SET == SSE_128 #ifdef DOUBLE_PRECISION_COMPLEX #define ROW_LENGTH 1 @@ -4953,222 +4923,3 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif /* BLOCK2 */ } - -//#endif - -#if 0 -//if VEC_SET == AVX_256 && BLOCK == 2 - -#ifdef DOUBLE_PRECISION_COMPLEX -static __forceinline void hh_trafo_complex_kernel_2_AVX_AVX2_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s) -#endif -#ifdef SINGLE_PRECISION_COMPLEX -static __forceinline void hh_trafo_complex_kernel_4_AVX_AVX2_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s) -#endif - -{ -#ifdef DOUBLE_PRECISION_COMPLEX - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); -#endif -#ifdef SINGLE_PRECISION_COMPLEX - float* q_dbl = (float*)q; - float* hh_dbl = (float*)hh; - float* s_dbl = (float*)(&s); -#endif - __SIMD_DATATYPE x1; - __SIMD_DATATYPE y1; - __SIMD_DATATYPE q1; - __SIMD_DATATYPE h1_real, h1_imag, h2_real, h2_imag; - __SIMD_DATATYPE tmp1; - int i=0; -#ifdef DOUBLE_PRECISION_COMPLEX - __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); -#endif -#ifdef SINGLE_PRECISION_COMPLEX - __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000); -#endif - x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]); - - h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); - h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _SIMD_XOR(h2_imag, sign); -#endif - - y1 = _SIMD_LOAD(&q_dbl[0]); - - tmp1 = _SIMD_MUL(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#else - y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); - - h1_real = _SIMD_BROADCAST(&hh_dbl[(i-1)*2]); - h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _SIMD_XOR(h1_imag, sign); -#endif - - tmp1 = _SIMD_MUL(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#else - x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#endif - - h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); - h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _SIMD_XOR(h2_imag, sign); -#endif - - tmp1 = _SIMD_MUL(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#else - y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#endif - } - - h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); - h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _SIMD_XOR(h1_imag, sign); -#endif - - q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); - - tmp1 = _SIMD_MUL(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#else - x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#endif - - h1_real = _SIMD_BROADCAST(&hh_dbl[0]); - h1_imag = _SIMD_BROADCAST(&hh_dbl[1]); - h1_real = _SIMD_XOR(h1_real, sign); - h1_imag = _SIMD_XOR(h1_imag, sign); - - tmp1 = _SIMD_MUL(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); -#else - x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); -#endif - - h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); - h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); - h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); - h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); - - h1_real = _SIMD_XOR(h1_real, sign); - h1_imag = _SIMD_XOR(h1_imag, sign); - h2_real = _SIMD_XOR(h2_real, sign); - h2_imag = _SIMD_XOR(h2_imag, sign); - - __SIMD_DATATYPE tmp2; -#ifdef DOUBLE_PRECISION_COMPLEX - tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); -#endif -#ifdef SINGLE_PRECISION_COMPLEX - tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0], - s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); -#endif - tmp1 = _SIMD_MUL(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); -#else - tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); -#endif - h2_real = _SIMD_SET1(tmp2[0]); - h2_imag = _SIMD_SET1(tmp2[1]); - - tmp1 = _SIMD_MUL(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); -#else - y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); -#endif - - tmp1 = _SIMD_MUL(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#else - y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#endif - - q1 = _SIMD_LOAD(&q_dbl[0]); - - q1 = _SIMD_ADD(q1, y1); - - _SIMD_STORE(&q_dbl[0], q1); - - h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); - h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]); - - q1 = _SIMD_ADD(q1, x1); - - tmp1 = _SIMD_MUL(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#else - q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#endif - - _SIMD_STORE(&q_dbl[(ldq*2)+0], q1); - - for (i = 2; i < nb; i++) - { - q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); - - h1_real = _SIMD_BROADCAST(&hh_dbl[(i-1)*2]); - h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _SIMD_MUL(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#else - q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#endif - - h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); - h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _SIMD_MUL(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#else - q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#endif - - _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1); - } - h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); - h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); - - q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); - - tmp1 = _SIMD_MUL(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#else - q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); -#endif - - _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1); -} -#endif -- GitLab