Commit 9413f74b authored by Andreas Marek

Try bigger step size for real single-precision AVX2 block 6 kernel

parent a07bcc94
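With the bigger step size the main driver loop handles 16 rows of q per iteration (two 8-float __m256 registers) and the previous 8-row kernel only has to absorb a single leftover block. The standalone C sketch below only illustrates that dispatch arithmetic; dispatch_sketch and its printf output are illustrative and not part of ELPA, and it assumes nq is a multiple of 8, as on the padded production path.

#include <stdio.h>

/* Sketch: which kernel widths the new step size selects for a given nq.
   Assumes nq is a multiple of 8 (padded production path); not ELPA code. */
static void dispatch_sketch(int nq)
{
    int i;
    for (i = 0; i < nq - 8; i += 16)        /* main loop: 16 rows per pass */
        printf("rows %2d..%2d -> 16-row AVX kernel\n", i, i + 15);
    if (nq != i)                            /* at most one 8-row tail remains */
        printf("rows %2d..%2d ->  8-row AVX kernel\n", i, i + 7);
}

int main(void)
{
    dispatch_sketch(16);   /* one 16-row pass, no tail      */
    dispatch_sketch(24);   /* one 16-row pass + 8-row tail  */
    return 0;
}

Under that assumption, with loop bound nq-8 and step 16 the else branch in the driver below sees at most one 8-row block.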
@@ -71,6 +71,7 @@
//Forward declaration
static void hh_trafo_kernel_4_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
void hexa_hh_trafo_real_avx_avx2_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
@@ -196,23 +197,9 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int*
}
// Production level kernel calls with padding
#ifdef __AVX__
for (i = 0; i < nq-4; i+=8)
for (i = 0; i < nq-8; i+=16)
{
hh_trafo_kernel_8_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
}
if (nq == i)
{
return;
}
else
{
hh_trafo_kernel_4_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
}
#else
for (i = 0; i < nq-2; i+=4)
{
hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
hh_trafo_kernel_16_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
}
if (nq == i)
{
@@ -220,19 +207,18 @@ void hexa_hh_trafo_real_avx_avx2_6hv_single(float* q, float* hh, int* pnb, int*
}
else
{
hh_trafo_kernel_2_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
hh_trafo_kernel_8_AVX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
}
#endif
}
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* 16 rows of Q simultaneously, a
* matrix vector product with six householder
* vectors + a rank 1 update is performed
*/
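As a scalar point of reference for the kernel that follows: it fuses six Householder reflectors over 16 rows of q, accumulating the six dot products in __m256 registers and finishing with rank-1 updates of Q. The sketch below shows a single reflector applied to 16 rows in plain C; apply_one_reflector_sketch, tau and dot[] are illustrative names and not part of this file, and the real kernel additionally corrects its accumulators with the pre-computed scalarprods.

/* Sketch only: apply one Householder reflector hh[0..nb-1] (hh[0] treated as 1)
 * with factor tau to 16 consecutive rows of a column block of q stored with
 * column stride ldq. The AVX2 kernel below does the analogous work for six
 * reflectors at once, keeping the per-row accumulators in __m256 registers. */
void apply_one_reflector_sketch(float *q, const float *hh, float tau,
                                int nb, int ldq)
{
    float dot[16];                          /* one accumulator per row */
    for (int r = 0; r < 16; r++)
    {
        dot[r] = q[r];                      /* j = 0 term, hh[0] == 1 */
        for (int j = 1; j < nb; j++)
            dot[r] += q[j * ldq + r] * hh[j];
    }
    for (int r = 0; r < 16; r++)            /* rank-1 update: q(r,j) -= tau * dot[r] * hh[j] */
    {
        q[r] -= tau * dot[r];
        for (int j = 1; j < nb; j++)
            q[j * ldq + r] -= tau * dot[r] * hh[j];
    }
}

The second loop corresponds to the "Rank-1 update of Q" stores further down in the kernel body.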
__forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
__forceinline void hh_trafo_kernel_16_AVX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [16 x nb+3] * hh
@@ -245,7 +231,7 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb,
__m256 a3_1 = _mm256_load_ps(&q[ldq*3]);
__m256 a4_1 = _mm256_load_ps(&q[ldq*2]);
__m256 a5_1 = _mm256_load_ps(&q[ldq]);
__m256 a6_1 = _mm256_load_ps(&q[0]); // q(1,1) | q(2,1) | q(3,1) | q(4,1)
__m256 a6_1 = _mm256_load_ps(&q[0]); // q(1,1) | q(2,1) | q(3,1) | q(4,1) .. q(8,1)
__m256 h_6_5 = _mm256_broadcast_ss(&hh[(ldh*5)+1]);
__m256 h_6_4 = _mm256_broadcast_ss(&hh[(ldh*5)+2]);
@@ -307,50 +293,50 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb,
register __m256 x1 = a1_1;
// __m256d a1_2 = _mm256_load_pd(&q[(ldq*5)+4]);
// __m256d a2_2 = _mm256_load_pd(&q[(ldq*4)+4]);
// __m256d a3_2 = _mm256_load_pd(&q[(ldq*3)+4]);
// __m256d a4_2 = _mm256_load_pd(&q[(ldq*2)+4]);
// __m256d a5_2 = _mm256_load_pd(&q[(ldq)+4]);
// __m256d a6_2 = _mm256_load_pd(&q[4]);
#ifdef __ELPA_USE_FMA__
// register __m256d t2 = _mm256_FMA_pd(a5_2, h_6_5, a6_2);
// t2 = _mm256_FMA_pd(a4_2, h_6_4, t2);
// t2 = _mm256_FMA_pd(a3_2, h_6_3, t2);
// t2 = _mm256_FMA_pd(a2_2, h_6_2, t2);
// t2 = _mm256_FMA_pd(a1_2, h_6_1, t2);
// register __m256d v2 = _mm256_FMA_pd(a4_2, h_5_4, a5_2);
// v2 = _mm256_FMA_pd(a3_2, h_5_3, v2);
// v2 = _mm256_FMA_pd(a2_2, h_5_2, v2);
// v2 = _mm256_FMA_pd(a1_2, h_5_1, v2);
// register __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2);
// w2 = _mm256_FMA_pd(a2_2, h_4_2, w2);
// w2 = _mm256_FMA_pd(a1_2, h_4_1, w2);
// register __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2);
// z2 = _mm256_FMA_pd(a1_2, h_3_1, z2);
// register __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2);
#else
// register __m256d t2 = _mm256_add_pd(a6_2, _mm256_mul_pd(a5_2, h_6_5));
// t2 = _mm256_add_pd(t2, _mm256_mul_pd(a4_2, h_6_4));
// t2 = _mm256_add_pd(t2, _mm256_mul_pd(a3_2, h_6_3));
// t2 = _mm256_add_pd(t2, _mm256_mul_pd(a2_2, h_6_2));
// t2 = _mm256_add_pd(t2, _mm256_mul_pd(a1_2, h_6_1));
// register __m256d v2 = _mm256_add_pd(a5_2, _mm256_mul_pd(a4_2, h_5_4));
// v2 = _mm256_add_pd(v2, _mm256_mul_pd(a3_2, h_5_3));
// v2 = _mm256_add_pd(v2, _mm256_mul_pd(a2_2, h_5_2));
// v2 = _mm256_add_pd(v2, _mm256_mul_pd(a1_2, h_5_1));
// register __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3));
// w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2));
// w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1));
// register __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2));
// z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1));
// register __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1));
#endif
// register __m256d x2 = a1_2;
__m256 a1_2 = _mm256_load_ps(&q[(ldq*5)+8]);
__m256 a2_2 = _mm256_load_ps(&q[(ldq*4)+8]);
__m256 a3_2 = _mm256_load_ps(&q[(ldq*3)+8]);
__m256 a4_2 = _mm256_load_ps(&q[(ldq*2)+8]);
__m256 a5_2 = _mm256_load_ps(&q[(ldq)+8]);
__m256 a6_2 = _mm256_load_ps(&q[8]);
#ifdef __ELPA_USE_FMA__
register __m256 t2 = _mm256_FMA_ps(a5_2, h_6_5, a6_2);
t2 = _mm256_FMA_ps(a4_2, h_6_4, t2);
t2 = _mm256_FMA_ps(a3_2, h_6_3, t2);
t2 = _mm256_FMA_ps(a2_2, h_6_2, t2);
t2 = _mm256_FMA_ps(a1_2, h_6_1, t2);
register __m256 v2 = _mm256_FMA_ps(a4_2, h_5_4, a5_2);
v2 = _mm256_FMA_ps(a3_2, h_5_3, v2);
v2 = _mm256_FMA_ps(a2_2, h_5_2, v2);
v2 = _mm256_FMA_ps(a1_2, h_5_1, v2);
register __m256 w2 = _mm256_FMA_ps(a3_2, h_4_3, a4_2);
w2 = _mm256_FMA_ps(a2_2, h_4_2, w2);
w2 = _mm256_FMA_ps(a1_2, h_4_1, w2);
register __m256 z2 = _mm256_FMA_ps(a2_2, h_3_2, a3_2);
z2 = _mm256_FMA_ps(a1_2, h_3_1, z2);
register __m256 y2 = _mm256_FMA_ps(a1_2, h_2_1, a2_2);
#else
register __m256 t2 = _mm256_add_ps(a6_2, _mm256_mul_ps(a5_2, h_6_5));
t2 = _mm256_add_ps(t2, _mm256_mul_ps(a4_2, h_6_4));
t2 = _mm256_add_ps(t2, _mm256_mul_ps(a3_2, h_6_3));
t2 = _mm256_add_ps(t2, _mm256_mul_ps(a2_2, h_6_2));
t2 = _mm256_add_ps(t2, _mm256_mul_ps(a1_2, h_6_1));
register __m256 v2 = _mm256_add_ps(a5_2, _mm256_mul_ps(a4_2, h_5_4));
v2 = _mm256_add_ps(v2, _mm256_mul_ps(a3_2, h_5_3));
v2 = _mm256_add_ps(v2, _mm256_mul_ps(a2_2, h_5_2));
v2 = _mm256_add_ps(v2, _mm256_mul_ps(a1_2, h_5_1));
register __m256 w2 = _mm256_add_ps(a4_2, _mm256_mul_ps(a3_2, h_4_3));
w2 = _mm256_add_ps(w2, _mm256_mul_ps(a2_2, h_4_2));
w2 = _mm256_add_ps(w2, _mm256_mul_ps(a1_2, h_4_1));
register __m256 z2 = _mm256_add_ps(a3_2, _mm256_mul_ps(a2_2, h_3_2));
z2 = _mm256_add_ps(z2, _mm256_mul_ps(a1_2, h_3_1));
register __m256 y2 = _mm256_add_ps(a2_2, _mm256_mul_ps(a1_2, h_2_1));
#endif
register __m256 x2 = a1_2;
__m256 q1;
// __m256d q2;
__m256 q2;
__m256 h1;
__m256 h2;
@@ -363,189 +349,189 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb,
{
h1 = _mm256_broadcast_ss(&hh[i-5]);
q1 = _mm256_load_ps(&q[i*ldq]);
// q2 = _mm256_load_pd(&q[(i*ldq)+4]);
q2 = _mm256_load_ps(&q[(i*ldq)+8]);
#ifdef __ELPA_USE_FMA__
x1 = _mm256_FMA_ps(q1, h1, x1);
// x2 = _mm256_FMA_pd(q2, h1, x2);
x2 = _mm256_FMA_ps(q2, h1, x2);
#else
x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1));
// x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
x2 = _mm256_add_ps(x2, _mm256_mul_ps(q2,h1));
#endif
h2 = _mm256_broadcast_ss(&hh[ldh+i-4]);
#ifdef __ELPA_USE_FMA__
y1 = _mm256_FMA_ps(q1, h2, y1);
// y2 = _mm256_FMA_pd(q2, h2, y2);
y2 = _mm256_FMA_ps(q2, h2, y2);
#else
y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2));
// y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2));
y2 = _mm256_add_ps(y2, _mm256_mul_ps(q2,h2));
#endif
h3 = _mm256_broadcast_ss(&hh[(ldh*2)+i-3]);
#ifdef __ELPA_USE_FMA__
z1 = _mm256_FMA_ps(q1, h3, z1);
// z2 = _mm256_FMA_pd(q2, h3, z2);
z2 = _mm256_FMA_ps(q2, h3, z2);
#else
z1 = _mm256_add_ps(z1, _mm256_mul_ps(q1,h3));
// z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3));
z2 = _mm256_add_ps(z2, _mm256_mul_ps(q2,h3));
#endif
h4 = _mm256_broadcast_ss(&hh[(ldh*3)+i-2]);
#ifdef __ELPA_USE_FMA__
w1 = _mm256_FMA_ps(q1, h4, w1);
// w2 = _mm256_FMA_pd(q2, h4, w2);
w2 = _mm256_FMA_ps(q2, h4, w2);
#else
w1 = _mm256_add_ps(w1, _mm256_mul_ps(q1,h4));
// w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4));
w2 = _mm256_add_ps(w2, _mm256_mul_ps(q2,h4));
#endif
h5 = _mm256_broadcast_ss(&hh[(ldh*4)+i-1]);
#ifdef __ELPA_USE_FMA__
v1 = _mm256_FMA_ps(q1, h5, v1);
// v2 = _mm256_FMA_pd(q2, h5, v2);
v2 = _mm256_FMA_ps(q2, h5, v2);
#else
v1 = _mm256_add_ps(v1, _mm256_mul_ps(q1,h5));
// v2 = _mm256_add_pd(v2, _mm256_mul_pd(q2,h5));
v2 = _mm256_add_ps(v2, _mm256_mul_ps(q2,h5));
#endif
h6 = _mm256_broadcast_ss(&hh[(ldh*5)+i]);
#ifdef __ELPA_USE_FMA__
t1 = _mm256_FMA_ps(q1, h6, t1);
// t2 = _mm256_FMA_pd(q2, h6, t2);
t2 = _mm256_FMA_ps(q2, h6, t2);
#else
t1 = _mm256_add_ps(t1, _mm256_mul_ps(q1,h6));
// t2 = _mm256_add_pd(t2, _mm256_mul_pd(q2,h6));
t2 = _mm256_add_ps(t2, _mm256_mul_ps(q2,h6));
#endif
}
h1 = _mm256_broadcast_ss(&hh[nb-5]);
q1 = _mm256_load_ps(&q[nb*ldq]);
// q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
q2 = _mm256_load_ps(&q[(nb*ldq)+8]);
#ifdef __ELPA_USE_FMA__
x1 = _mm256_FMA_ps(q1, h1, x1);
// x2 = _mm256_FMA_pd(q2, h1, x2);
x2 = _mm256_FMA_ps(q2, h1, x2);
#else
x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1));
// x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
x2 = _mm256_add_ps(x2, _mm256_mul_ps(q2,h1));
#endif
h2 = _mm256_broadcast_ss(&hh[ldh+nb-4]);
#ifdef __ELPA_USE_FMA__
y1 = _mm256_FMA_ps(q1, h2, y1);
// y2 = _mm256_FMA_pd(q2, h2, y2);
y2 = _mm256_FMA_ps(q2, h2, y2);
#else
y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2));
// y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2));
y2 = _mm256_add_ps(y2, _mm256_mul_ps(q2,h2));
#endif
h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-3]);
#ifdef __ELPA_USE_FMA__
z1 = _mm256_FMA_ps(q1, h3, z1);
// z2 = _mm256_FMA_pd(q2, h3, z2);
z2 = _mm256_FMA_ps(q2, h3, z2);
#else
z1 = _mm256_add_ps(z1, _mm256_mul_ps(q1,h3));
// z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3));
z2 = _mm256_add_ps(z2, _mm256_mul_ps(q2,h3));
#endif
h4 = _mm256_broadcast_ss(&hh[(ldh*3)+nb-2]);
#ifdef __ELPA_USE_FMA__
w1 = _mm256_FMA_ps(q1, h4, w1);
// w2 = _mm256_FMA_pd(q2, h4, w2);
w2 = _mm256_FMA_ps(q2, h4, w2);
#else
w1 = _mm256_add_ps(w1, _mm256_mul_ps(q1,h4));
// w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4));
w2 = _mm256_add_ps(w2, _mm256_mul_ps(q2,h4));
#endif
h5 = _mm256_broadcast_ss(&hh[(ldh*4)+nb-1]);
#ifdef __ELPA_USE_FMA__
v1 = _mm256_FMA_ps(q1, h5, v1);
// v2 = _mm256_FMA_pd(q2, h5, v2);
v2 = _mm256_FMA_ps(q2, h5, v2);
#else
v1 = _mm256_add_ps(v1, _mm256_mul_ps(q1,h5));
// v2 = _mm256_add_pd(v2, _mm256_mul_pd(q2,h5));
v2 = _mm256_add_ps(v2, _mm256_mul_ps(q2,h5));
#endif
h1 = _mm256_broadcast_ss(&hh[nb-4]);
q1 = _mm256_load_ps(&q[(nb+1)*ldq]);
// q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]);
q2 = _mm256_load_ps(&q[((nb+1)*ldq)+8]);
#ifdef __ELPA_USE_FMA__
x1 = _mm256_FMA_ps(q1, h1, x1);
// x2 = _mm256_FMA_pd(q2, h1, x2);
x2 = _mm256_FMA_ps(q2, h1, x2);
#else
x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1));
// x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
x2 = _mm256_add_ps(x2, _mm256_mul_ps(q2,h1));
#endif
h2 = _mm256_broadcast_ss(&hh[ldh+nb-3]);
#ifdef __ELPA_USE_FMA__
y1 = _mm256_FMA_ps(q1, h2, y1);
// y2 = _mm256_FMA_pd(q2, h2, y2);
y2 = _mm256_FMA_ps(q2, h2, y2);
#else
y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2));
// y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2));
y2 = _mm256_add_ps(y2, _mm256_mul_ps(q2,h2));
#endif
h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-2]);
#ifdef __ELPA_USE_FMA__
z1 = _mm256_FMA_ps(q1, h3, z1);
// z2 = _mm256_FMA_pd(q2, h3, z2);
z2 = _mm256_FMA_ps(q2, h3, z2);
#else
z1 = _mm256_add_ps(z1, _mm256_mul_ps(q1,h3));
// z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3));
z2 = _mm256_add_ps(z2, _mm256_mul_ps(q2,h3));
#endif
h4 = _mm256_broadcast_ss(&hh[(ldh*3)+nb-1]);
#ifdef __ELPA_USE_FMA__
w1 = _mm256_FMA_ps(q1, h4, w1);
// w2 = _mm256_FMA_pd(q2, h4, w2);
w2 = _mm256_FMA_ps(q2, h4, w2);
#else
w1 = _mm256_add_ps(w1, _mm256_mul_ps(q1,h4));
// w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4));
w2 = _mm256_add_ps(w2, _mm256_mul_ps(q2,h4));
#endif
h1 = _mm256_broadcast_ss(&hh[nb-3]);
q1 = _mm256_load_ps(&q[(nb+2)*ldq]);
// q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]);
q2 = _mm256_load_ps(&q[((nb+2)*ldq)+8]);
#ifdef __ELPA_USE_FMA__
x1 = _mm256_FMA_ps(q1, h1, x1);
// x2 = _mm256_FMA_pd(q2, h1, x2);
x2 = _mm256_FMA_ps(q2, h1, x2);
#else
x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1));
// x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
x2 = _mm256_add_ps(x2, _mm256_mul_ps(q2,h1));
#endif
h2 = _mm256_broadcast_ss(&hh[ldh+nb-2]);
#ifdef __ELPA_USE_FMA__
y1 = _mm256_FMA_ps(q1, h2, y1);
// y2 = _mm256_FMA_pd(q2, h2, y2);
y2 = _mm256_FMA_ps(q2, h2, y2);
#else
y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2));
// y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2));
y2 = _mm256_add_ps(y2, _mm256_mul_ps(q2,h2));
#endif
h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-1]);
#ifdef __ELPA_USE_FMA__
z1 = _mm256_FMA_ps(q1, h3, z1);
// z2 = _mm256_FMA_pd(q2, h3, z2);
z2 = _mm256_FMA_ps(q2, h3, z2);
#else
z1 = _mm256_add_ps(z1, _mm256_mul_ps(q1,h3));
// z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3));
z2 = _mm256_add_ps(z2, _mm256_mul_ps(q2,h3));
#endif
h1 = _mm256_broadcast_ss(&hh[nb-2]);
q1 = _mm256_load_ps(&q[(nb+3)*ldq]);
// q2 = _mm256_load_pd(&q[((nb+3)*ldq)+4]);
q2 = _mm256_load_ps(&q[((nb+3)*ldq)+8]);
#ifdef __ELPA_USE_FMA__
x1 = _mm256_FMA_ps(q1, h1, x1);
// x2 = _mm256_FMA_pd(q2, h1, x2);
x2 = _mm256_FMA_ps(q2, h1, x2);
#else
x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1));
// x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
x2 = _mm256_add_ps(x2, _mm256_mul_ps(q2,h1));
#endif
h2 = _mm256_broadcast_ss(&hh[ldh+nb-1]);
#ifdef __ELPA_USE_FMA__
y1 = _mm256_FMA_ps(q1, h2, y1);
// y2 = _mm256_FMA_pd(q2, h2, y2);
y2 = _mm256_FMA_ps(q2, h2, y2);
#else
y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2));
// y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2));
y2 = _mm256_add_ps(y2, _mm256_mul_ps(q2,h2));
#endif
h1 = _mm256_broadcast_ss(&hh[nb-1]);
q1 = _mm256_load_ps(&q[(nb+4)*ldq]);
// q2 = _mm256_load_pd(&q[((nb+4)*ldq)+4]);
q2 = _mm256_load_ps(&q[((nb+4)*ldq)+8]);
#ifdef __ELPA_USE_FMA__
x1 = _mm256_FMA_ps(q1, h1, x1);
// x2 = _mm256_FMA_pd(q2, h1, x2);
x2 = _mm256_FMA_ps(q2, h1, x2);
#else
x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1));
// x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
x2 = _mm256_add_ps(x2, _mm256_mul_ps(q2,h1));
#endif
/////////////////////////////////////////////////////
@@ -554,17 +540,17 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb,
__m256 tau1 = _mm256_broadcast_ss(&hh[0]);
x1 = _mm256_mul_ps(x1, tau1);
// x2 = _mm256_mul_pd(x2, tau1);
x2 = _mm256_mul_ps(x2, tau1);
__m256 tau2 = _mm256_broadcast_ss(&hh[ldh]);
__m256 vs_1_2 = _mm256_broadcast_ss(&scalarprods[0]);
h2 = _mm256_mul_ps(tau2, vs_1_2);
#ifdef __ELPA_USE_FMA__
y1 = _mm256_FMSUB_ps(y1, tau2, _mm256_mul_ps(x1,h2));
// y2 = _mm256_FMSUB_pd(y2, tau2, _mm256_mul_pd(x2,h2));
y2 = _mm256_FMSUB_ps(y2, tau2, _mm256_mul_ps(x2,h2));
#else
y1 = _mm256_sub_ps(_mm256_mul_ps(y1,tau2), _mm256_mul_ps(x1,h2));
// y2 = _mm256_sub_pd(_mm256_mul_pd(y2,tau2), _mm256_mul_pd(x2,h2));
y2 = _mm256_sub_ps(_mm256_mul_ps(y2,tau2), _mm256_mul_ps(x2,h2));
#endif
__m256 tau3 = _mm256_broadcast_ss(&hh[ldh*2]);
@@ -574,10 +560,10 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb,
h3 = _mm256_mul_ps(tau3, vs_2_3);
#ifdef __ELPA_USE_FMA__
z1 = _mm256_FMSUB_ps(z1, tau3, _mm256_FMA_ps(y1, h3, _mm256_mul_ps(x1,h2)));
// z2 = _mm256_FMSUB_pd(z2, tau3, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)));
z2 = _mm256_FMSUB_ps(z2, tau3, _mm256_FMA_ps(y2, h3, _mm256_mul_ps(x2,h2)));
#else
z1 = _mm256_sub_ps(_mm256_mul_ps(z1,tau3), _mm256_add_ps(_mm256_mul_ps(y1,h3), _mm256_mul_ps(x1,h2)));
// z2 = _mm256_sub_pd(_mm256_mul_pd(z2,tau3), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)));
z2 = _mm256_sub_ps(_mm256_mul_ps(z2,tau3), _mm256_add_ps(_mm256_mul_ps(y2,h3), _mm256_mul_ps(x2,h2)));
#endif
__m256 tau4 = _mm256_broadcast_ss(&hh[ldh*3]);
@@ -589,10 +575,10 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb,
h4 = _mm256_mul_ps(tau4, vs_3_4);
#ifdef __ELPA_USE_FMA__
w1 = _mm256_FMSUB_ps(w1, tau4, _mm256_FMA_ps(z1, h4, _mm256_FMA_ps(y1, h3, _mm256_mul_ps(x1,h2))));
// w2 = _mm256_FMSUB_pd(w2, tau4, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))));
w2 = _mm256_FMSUB_ps(w2, tau4, _mm256_FMA_ps(z2, h4, _mm256_FMA_ps(y2, h3, _mm256_mul_ps(x2,h2))));
#else
w1 = _mm256_sub_ps(_mm256_mul_ps(w1,tau4), _mm256_add_ps(_mm256_mul_ps(z1,h4), _mm256_add_ps(_mm256_mul_ps(y1,h3), _mm256_mul_ps(x1,h2))));
// w2 = _mm256_sub_pd(_mm256_mul_pd(w2,tau4), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))));
w2 = _mm256_sub_ps(_mm256_mul_ps(w2,tau4), _mm256_add_ps(_mm256_mul_ps(z2,h4), _mm256_add_ps(_mm256_mul_ps(y2,h3), _mm256_mul_ps(x2,h2))));
#endif
__m256 tau5 = _mm256_broadcast_ss(&hh[ldh*4]);
@@ -606,10 +592,10 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb,
h5 = _mm256_mul_ps(tau5, vs_4_5);
#ifdef __ELPA_USE_FMA__
v1 = _mm256_FMSUB_ps(v1, tau5, _mm256_add_ps(_mm256_FMA_ps(w1, h5, _mm256_mul_ps(z1,h4)), _mm256_FMA_ps(y1, h3, _mm256_mul_ps(x1,h2))));
// v2 = _mm256_FMSUB_pd(v2, tau5, _mm256_add_pd(_mm256_FMA_pd(w2, h5, _mm256_mul_pd(z2,h4)), _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))));
v2 = _mm256_FMSUB_ps(v2, tau5, _mm256_add_ps(_mm256_FMA_ps(w2, h5, _mm256_mul_ps(z2,h4)), _mm256_FMA_ps(y2, h3, _mm256_mul_ps(x2,h2))));
#else
v1 = _mm256_sub_ps(_mm256_mul_ps(v1,tau5), _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(w1,h5), _mm256_mul_ps(z1,h4)), _mm256_add_ps(_mm256_mul_ps(y1,h3), _mm256_mul_ps(x1,h2))));
// v2 = _mm256_sub_pd(_mm256_mul_pd(v2,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2,h5), _mm256_mul_pd(z2,h4)), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))));
v2 = _mm256_sub_ps(_mm256_mul_ps(v2,tau5), _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(w2,h5), _mm256_mul_ps(z2,h4)), _mm256_add_ps(_mm256_mul_ps(y2,h3), _mm256_mul_ps(x2,h2))));
#endif
__m256 tau6 = _mm256_broadcast_ss(&hh[ldh*5]);
@@ -625,385 +611,385 @@ __forceinline void hh_trafo_kernel_8_AVX_6hv_single(float* q, float* hh, int nb,
h6 = _mm256_mul_ps(tau6, vs_5_6);
#ifdef __ELPA_USE_FMA__
t1 = _mm256_FMSUB_ps(t1, tau6, _mm256_FMA_ps(v1, h6, _mm256_add_ps(_mm256_FMA_ps(w1, h5, _mm256_mul_ps(z1,h4)), _mm256_FMA_ps(y1, h3, _mm256_mul_ps(x1,h2)))));
// t2 = _mm256_FMSUB_pd(t2, tau6, _mm256_FMA_pd(v2, h6, _mm256_add_pd(_mm256_FMA_pd(w2, h5, _mm256_mul_pd(z2,h4)), _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))));
t2 = _mm256_FMSUB_ps(t2, tau6, _mm256_FMA_ps(v2, h6, _mm256_add_ps(_mm256_FMA_ps(w2, h5, _mm256_mul_ps(z2,h4)), _mm256_FMA_ps(y2, h3, _mm256_mul_ps(x2,h2)))));
#else
t1 = _mm256_sub_ps(_mm256_mul_ps(t1,tau6), _mm256_add_ps( _mm256_mul_ps(v1,h6), _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(w1,h5), _mm256_mul_ps(z1,h4)), _mm256_add_ps(_mm256_mul_ps(y1,h3), _mm256_mul_ps(x1,h2)))));
// t2 = _mm256_sub_pd(_mm256_mul_pd(t2,tau6), _mm256_add_pd( _mm256_mul_pd(v2,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2,h5), _mm256_mul_pd(z2,h4)), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))));
t2 = _mm256_sub_ps(_mm256_mul_ps(t2,tau6), _mm256_add_ps( _mm256_mul_ps(v2,h6), _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(w2,h5), _mm256_mul_ps(z2,h4)), _mm256_add_ps(_mm256_mul_ps(y2,h3), _mm256_mul_ps(x2,h2)))));
#endif
/////////////////////////////////////////////////////
// Rank-1 update of Q [8 x nb+3]
// Rank-1 update of Q [16 x nb+3]
/////////////////////////////////////////////////////
q1 = _mm256_load_ps(&q[0]);
// q2 = _mm256_load_pd(&q[4]);
q2 = _mm256_load_ps(&q[8]);
q1 = _mm256_sub_ps(q1, t1);
// q2 = _mm256_sub_pd(q2, t2);
q2 = _mm256_sub_ps(q2, t2);
_mm256_store_ps(&q[0],q1);
// _mm256_store_pd(&q[4],q2);
_mm256_store_ps(&q[8],q2);
h6 = _mm256_broadcast_ss(&hh[(ldh*5)+1]);
q1 = _mm256_load_ps(&q[ldq]);
// q2 = _mm256_load_pd(&q[(ldq+4)]);
q2 = _mm256_load_ps(&q[(ldq+8)]);
q1 = _mm256_sub_ps(q1, v1);
// q2 = _mm256_sub_pd(q2, v2);
q2 = _mm256_sub_ps(q2, v2);
#ifdef __ELPA_USE_FMA__
q1 = _mm256_NFMA_ps(t1, h6, q1);
// q2 = _mm256_NFMA_pd(t2, h6, q2);
q2 = _mm256_NFMA_ps(t2, h6, q2);
#else
q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6));
// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6));
q2 = _mm256_sub_ps(q2, _mm256_mul_ps(t2, h6));
#endif
_mm256_store_ps(&q[ldq],q1);
// _mm256_store_pd(&q[(ldq+4)],q2);
_mm256_store_ps(&q[(ldq+8)],q2);
h5 = _mm256_broadcast_ss(&hh[(ldh*4)+1]);
q1 = _mm256_load_ps(&q[ldq*2]);
// q2 = _mm256_load_pd(&q[(ldq*2)+4]);
q2 = _mm256_load_ps(&q[(ldq*2)+8]);
q1 = _mm256_sub_ps(q1, w1);
// q2 = _mm256_sub_pd(q2, w2);
q2 = _mm256_sub_ps(q2, w2);
#ifdef __ELPA_USE_FMA__
q1 = _mm256_NFMA_ps(v1, h5, q1);
// q2 = _mm256_NFMA_pd(v2, h5, q2);
q2 = _mm256_NFMA_ps(v2, h5, q2);
#else
q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5));
// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5));
q2 = _mm256_sub_ps(q2, _mm256_mul_ps(v2, h5));
#endif
h6 = _mm256_broadcast_ss(&hh[(ldh*5)+2]);
#ifdef __ELPA_USE_FMA__
q1 = _mm256_NFMA_ps(t1, h6, q1);
// q2 = _mm256_NFMA_pd(t2, h6, q2);
q2 = _mm256_NFMA_ps(t2, h6, q2);
#else
q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6));
// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6));
q2 = _mm256_sub_ps(q2, _mm256_mul_ps(t2, h6));
#endif
_mm256_store_ps(&q[ldq*2],q1);
// _mm256_store_pd(&q[(ldq*2)+4],q2);
_mm256_store_ps(&q[(ldq*2)+8],q2);
h4 = _mm256_broadcast_ss(&hh[(ldh*3)+1]);
q1 = _mm256_load_ps(&q[ldq*3]);
// q2 = _mm256_load_pd(&q[(ldq*3)+4]);
q2 = _mm256_load_ps(&q[(ldq*3)+8]);
q1 = _mm256_sub_ps(q1, z1);
// q2 = _mm256_sub_pd(q2, z2);
q2 = _mm256_sub_ps(q2, z2);
#ifdef __ELPA_USE_FMA__
q1 = _mm256_NFMA_ps(w1, h4, q1);
// q2 = _mm256_NFMA_pd(w2, h4, q2);
q2 = _mm256_NFMA_ps(w2, h4, q2);
#else
q1 = _mm256_sub_ps(q1, _mm256_mul_ps(w1, h4));
// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4));
q2 = _mm256_sub_ps(q2, _mm256_mul_ps(w2, h4));
#endif
h5 = _mm256_broadcast_ss(&hh[(ldh*4)+2]);
#ifdef __ELPA_USE_FMA__
q1 = _mm256_NFMA_ps(v1, h5, q1);
// q2 = _mm256_NFMA_pd(v2, h5, q2);
q2 = _mm256_NFMA_ps(v2, h5, q2);
#else
q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5));
// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5));
q2 = _mm256_sub_ps(q2, _mm256_mul_ps(v2, h5));
#endif
h6 = _mm256_broadcast_ss(&hh[(ldh*5)+3]);
#ifdef __ELPA_USE_FMA__