Commit 7de3d8d3 authored by Andreas Marek
Browse files

Fix all sign errors

parent 24586e3d
......@@ -788,37 +788,16 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif /* VEC_SET == AVX_256 */
#if VEC_SET == SSE_128
for (i = 0; i < nq - UPPER_BOUND; i+=STEP_SIZE)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
worked_on +=ROW_LENGTH;
}
#endif
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
#define STEP_SIZE 4
#define UPPER_BOUND 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#define STEP_SIZE 8
#define UPPER_BOUND 4
#endif
for (i = 0; i < nq - UPPER_BOUND; i+=STEP_SIZE)
{
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i+ROW_LENGTH], hh, nb, ldq, ldh, s);
worked_on +=STEP_SIZE;
}
#endif
if (nq == i)
{
return;
}
#if VEC_SET == SSE_128
#if VEC_SET == SSE_128
#undef ROW_LENGTH
......@@ -873,9 +852,6 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
}
#endif
#if VEC_SET == SSE_128
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
......@@ -903,13 +879,13 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
}
#endif /* BLOCK2 */
//#ifdef WITH_DEBUG
#ifdef WITH_DEBUG
if (worked_on != nq)
{
printf("Error in complex SIMD_SET BLOCK BLOCK kernel %d %d\n", worked_on, nq);
abort();
}
//#endif
#endif
}
......@@ -1378,7 +1354,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
tmp2 = _SIMD_FMSUBADD(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
......@@ -1401,78 +1377,78 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
y1 = _SIMD_FMSUBADD(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
tmp2 = _SIMD_MUL(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
y2 = _SIMD_FMSUBADD(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
tmp3 = _SIMD_MUL(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
y3 = _SIMD_FMSUBADD(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
tmp4 = _SIMD_MUL(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
y4 = _SIMD_FMSUBADD(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif
tmp5 = _SIMD_MUL(h1_imag, y5);
#ifdef __ELPA_USE_FMA__
y5 = _SIMD_FMSUBADD(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
y5 = _SIMD_FMADDSUB(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
#else
y5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
#endif
tmp6 = _SIMD_MUL(h1_imag, y6);
#ifdef __ELPA_USE_FMA__
y6 = _SIMD_FMSUBADD(h1_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
y6 = _SIMD_FMADDSUB(h1_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
#else
y6 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
#endif
tmp1 = _SIMD_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h2_imag, x5);
#ifdef __ELPA_USE_FMA__
y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
y5 = _SIMD_ADD(y5, _SIMD_FMADDSUB(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
tmp6 = _SIMD_MUL(h2_imag, x6);
#ifdef __ELPA_USE_FMA__
y6 = _SIMD_ADD(y6, _SIMD_FMSUBADD(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
y6 = _SIMD_ADD(y6, _SIMD_FMADDSUB(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
......@@ -1547,39 +1523,39 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h2_imag, y5);
#ifdef __ELPA_USE_FMA__
q5 = _SIMD_ADD(q5, _SIMD_FMSUBADD(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
tmp6 = _SIMD_MUL(h2_imag, y6);
#ifdef __ELPA_USE_FMA__
q6 = _SIMD_ADD(q6, _SIMD_FMSUBADD(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
......@@ -1678,39 +1654,39 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h2_imag, y5);
#ifdef __ELPA_USE_FMA__
q5 = _SIMD_ADD(q5, _SIMD_FMSUBADD(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
tmp6 = _SIMD_MUL(h2_imag, y6);
#ifdef __ELPA_USE_FMA__
q6 = _SIMD_ADD(q6, _SIMD_FMSUBADD(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
......@@ -1752,39 +1728,39 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
q5 = _SIMD_ADD(q5, _SIMD_FMSUBADD(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
tmp6 = _SIMD_MUL(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
q6 = _SIMD_ADD(q6, _SIMD_FMSUBADD(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
......@@ -2232,7 +2208,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
tmp2 = _SIMD_FMSUBADD(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
......@@ -2255,66 +2231,66 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
y1 = _SIMD_FMSUBADD(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
tmp2 = _SIMD_MUL(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
y2 = _SIMD_FMSUBADD(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
tmp3 = _SIMD_MUL(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
y3 = _SIMD_FMSUBADD(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
tmp4 = _SIMD_MUL(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
y4 = _SIMD_FMSUBADD(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif
tmp5 = _SIMD_MUL(h1_imag, y5);
#ifdef __ELPA_USE_FMA__
y5 = _SIMD_FMSUBADD(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
y5 = _SIMD_FMADDSUB(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
#else
y5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
#endif
tmp1 = _SIMD_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h2_imag, x5);
#ifdef __ELPA_USE_FMA__
y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
y5 = _SIMD_ADD(y5, _SIMD_FMADDSUB(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
......@@ -2383,33 +2359,33 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h2_imag, y5);
#ifdef __ELPA_USE_FMA__
q5 = _SIMD_ADD(q5, _SIMD_FMSUBADD(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
......@@ -2500,33 +2476,33 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h2_imag, y5);
#ifdef __ELPA_USE_FMA__
q5 = _SIMD_ADD(q5, _SIMD_FMSUBADD(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
......@@ -2566,33 +2542,33 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
q1 = _SIMD_ADD(q1, _SIMD_FMSUBADD(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
q2 = _SIMD_ADD(q2, _SIMD_FMSUBADD(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
q3 = _SIMD_ADD(q3, _SIMD_FMSUBADD(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
tmp4 = _SIMD_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
q4 = _SIMD_ADD(q4, _SIMD_FMSUBADD(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
tmp5 = _SIMD_MUL(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
q5 = _SIMD_ADD(q5, _SIMD_FMSUBADD(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
......@@ -3003,7 +2979,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
tmp2 = _SIMD_FMSUBADD(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
......@@ -3026,56 +3002,56 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
tmp1 = _SIMD_MUL(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
y1 = _SIMD_FMSUBADD(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
tmp2 = _SIMD_MUL(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
y2 = _SIMD_FMSUBADD(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
tmp3 = _SIMD_MUL(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
y3 = _SIMD_FMSUBADD(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
tmp4 = _SIMD_MUL(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
y4 = _SIMD_FMSUBADD(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif
tmp1 = _SIMD_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
tmp2 = _SIMD_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
tmp3 = _SIMD_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__