Unverified Commit 17b01704 authored by Andreas Marek's avatar Andreas Marek
Browse files

Single precision SSE BLOCK2 complex kernel

parent 0f665949
......@@ -926,18 +926,6 @@ function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
endif
#ifndef DOUBLE_PRECISION_COMPLEX
if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) ) then
else
print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
useGPU=.true.
......@@ -1253,18 +1241,7 @@ function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
endif
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
endif
#ifndef DOUBLE_PRECISION_COMPLEX
if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) ) then
else
print *,"At the moment single precision only works with the generic kernels"
stop
endif
#endif
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
useGPU=.true.
......
......@@ -73,110 +73,9 @@
extern "C" {
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_single(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_single(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_single(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(std::complex<float>* q, std::complex<float>* hh, int nb, int ldq, int ldh, std::complex<float> s, std::complex<float> s1);
#if 0
static __forceinline void hh_trafo_complex_kernel_4_C_2hv_single(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
std::complex<double> x1;
std::complex<double> x2;
std::complex<double> x3;
std::complex<double> x4;
std::complex<double> y1;
std::complex<double> y2;
std::complex<double> y3;
std::complex<double> y4;
std::complex<double> h1;
std::complex<double> h2;
std::complex<double> tau1;
std::complex<double> tau2;
int i=0;
x1 = q[ldq+0];
x2 = q[ldq+1];
x3 = q[ldq+2];
x4 = q[ldq+3];
h2 = conj(hh[ldh+1]);
y1 = q[0] + (x1*h2);
y2 = q[1] + (x2*h2);
y3 = q[2] + (x3*h2);
y4 = q[3] + (x4*h2);
for (i = 2; i < nb; i++)
{
h1 = conj(hh[i-1]);
h2 = conj(hh[ldh+i]);
x1 += (q[(i*ldq)+0] * h1);
y1 += (q[(i*ldq)+0] * h2);
x2 += (q[(i*ldq)+1] * h1);
y2 += (q[(i*ldq)+1] * h2);
x3 += (q[(i*ldq)+2] * h1);
y3 += (q[(i*ldq)+2] * h2);
x4 += (q[(i*ldq)+3] * h1);
y4 += (q[(i*ldq)+3] * h2);
}
h1 = conj(hh[nb-1]);
x1 += (q[(nb*ldq)+0] * h1);
x2 += (q[(nb*ldq)+1] * h1);
x3 += (q[(nb*ldq)+2] * h1);
x4 += (q[(nb*ldq)+3] * h1);
tau1 = hh[0];
tau2 = hh[ldh];
h1 = (-1.0)*tau1;
x1 *= h1;
x2 *= h1;
x3 *= h1;
x4 *= h1;
h1 = (-1.0)*tau2;
h2 = (-1.0)*tau2;
h2 *= s;
y1 = y1*h1 +x1*h2;
y2 = y2*h1 +x2*h2;
y3 = y3*h1 +x3*h2;
y4 = y4*h1 +x4*h2;
q[0] += y1;
q[1] += y2;
q[2] += y3;
q[3] += y4;
h2 = hh[ldh+1];
q[ldq+0] += (x1 + (y1*h2));
q[ldq+1] += (x2 + (y2*h2));
q[ldq+2] += (x3 + (y3*h2));
q[ldq+3] += (x4 + (y4*h2));
for (i = 2; i < nb; i++)
{
h1 = hh[i-1];
h2 = hh[ldh+i];
q[(i*ldq)+0] += ((x1*h1) + (y1*h2));
q[(i*ldq)+1] += ((x2*h1) + (y2*h2));
q[(i*ldq)+2] += ((x3*h1) + (y3*h2));
q[(i*ldq)+3] += ((x4*h1) + (y4*h2));
}
h1 = hh[nb-1];
q[(nb*ldq)+0] += (x1*h1);
q[(nb*ldq)+1] += (x2*h1);
q[(nb*ldq)+2] += (x3*h1);
q[(nb*ldq)+3] += (x4*h1);
}
#endif
void double_hh_trafo_complex_sse_2hv_single_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
void double_hh_trafo_complex_sse_2hv_single_(std::complex<float>* q, std::complex<float>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
int i;
int nb = *pnb;
......@@ -184,1282 +83,448 @@ void double_hh_trafo_complex_sse_2hv_single_(std::complex<double>* q, std::compl
int ldq = *pldq;
int ldh = *pldh;
std::complex<double> s = conj(hh[(ldh)+1])*1.0;
std::complex<float> s = conj(hh[(ldh)+1])*1.0f;
for (i = 2; i < nb; i++)
{
s += hh[i-1] * conj(hh[(i+ldh)]);
}
#if 1
for (i = 0; i < nq; i+=4)
{
hh_trafo_complex_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
}
#else
for (i = 0; i < nq-2; i+=3)
{
hh_trafo_complex_kernel_3_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
}
if (nq-i > 1)
{
hh_trafo_complex_kernel_2_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
}
else if (nq-i > 0)
{
hh_trafo_complex_kernel_1_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s);
}
#endif
}
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(std::complex<float>* q, std::complex<float>* hh, int nb, int ldq, int ldh, std::complex<float> s, std::complex<float> s1)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
double* s_dbl = (double*)(&s);
__m128d x1, x2, x3, x4;
__m128d y1, y2, y3, y4;
__m128d q1, q2, q3, q4;
__m128d h1_real, h1_imag, h2_real, h2_imag;
__m128d tmp1, tmp2, tmp3, tmp4;
float* q_dbl = (float*)q;
float* hh_dbl = (float*)hh;
float* s_dbl = (float*)(&s);
__m128 x1, x2, x3, x4;
__m128 y1, y2, y3, y4;
__m128 q1, q2, q3, q4;
__m128 h1_real, h1_imag, h2_real, h2_imag;
__m128 tmp1, tmp2, tmp3, tmp4;
int i=0;
__m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
// __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
__m128 sign = (__m128)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]);
x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]);
x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]);
x4 = _mm_load_pd(&q_dbl[(2*ldq)+6]);
x1 = _mm_load_ps(&q_dbl[(2*ldq)+0]); // q(1,1) ... q(2,1)
x2 = _mm_load_ps(&q_dbl[(2*ldq)+4]); // q(3,1) ... q(4,1)
// x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]);
// x4 = _mm_load_pd(&q_dbl[(2*ldq)+6]);
h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _mm_xor_pd(h2_imag, sign);
h2_imag = _mm_xor_ps(h2_imag, sign);
#endif
y1 = _mm_load_pd(&q_dbl[0]);
y2 = _mm_load_pd(&q_dbl[2]);
y3 = _mm_load_pd(&q_dbl[4]);
y4 = _mm_load_pd(&q_dbl[6]);
y1 = _mm_load_ps(&q_dbl[0]);
y2 = _mm_load_ps(&q_dbl[4]);
// y3 = _mm_load_pd(&q_dbl[4]);
// y4 = _mm_load_pd(&q_dbl[6]);
tmp1 = _mm_mul_pd(h2_imag, x1);
tmp1 = _mm_mul_ps(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
y1 = _mm_add_ps(y1, _mm_msubadd_ps(h2_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
y1 = _mm_add_ps(y1, _mm_addsub_ps( _mm_mul_ps(h2_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
tmp2 = _mm_mul_pd(h2_imag, x2);
tmp2 = _mm_mul_ps(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
y2 = _mm_add_ps(y2, _mm_msubadd_ps(h2_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
y2 = _mm_add_ps(y2, _mm_addsub_ps( _mm_mul_ps(h2_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
tmp3 = _mm_mul_pd(h2_imag, x3);
// tmp3 = _mm_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
// y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
// y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
tmp4 = _mm_mul_pd(h2_imag, x4);
// tmp4 = _mm_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
// y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
// y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
for (i = 2; i < nb; i++)
{
q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]);
q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]);
q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);
q1 = _mm_load_ps(&q_dbl[(2*i*ldq)+0]);
q2 = _mm_load_ps(&q_dbl[(2*i*ldq)+4]);
// q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]);
// q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);
h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);
h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _mm_xor_pd(h1_imag, sign);
h1_imag = _mm_xor_ps(h1_imag, sign);
#endif
tmp1 = _mm_mul_pd(h1_imag, q1);
tmp1 = _mm_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
x1 = _mm_add_ps(x1, _mm_msubadd_ps(h1_real, q1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
x1 = _mm_add_ps(x1, _mm_addsub_ps( _mm_mul_ps(h1_real, q1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
tmp2 = _mm_mul_pd(h1_imag, q2);
tmp2 = _mm_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
x2 = _mm_add_ps(x2, _mm_msubadd_ps(h1_real, q2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
x2 = _mm_add_ps(x2, _mm_addsub_ps( _mm_mul_ps(h1_real, q2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
tmp3 = _mm_mul_pd(h1_imag, q3);
// tmp3 = _mm_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
// x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
// x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
tmp4 = _mm_mul_pd(h1_imag, q4);
// tmp4 = _mm_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
// x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
// x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
// conjugate
h2_imag = _mm_xor_pd(h2_imag, sign);
h2_imag = _mm_xor_ps(h2_imag, sign);
#endif
tmp1 = _mm_mul_pd(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#else
y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#endif
tmp2 = _mm_mul_pd(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#else
y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#endif
tmp3 = _mm_mul_pd(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
tmp4 = _mm_mul_pd(h2_imag, q4);
#ifdef __ELPA_USE_FMA__
y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
}
h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag = _mm_xor_pd(h1_imag, sign);
#endif
q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]);
q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]);
q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]);
q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]);
tmp1 = _mm_mul_pd(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#else
x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#endif
tmp2 = _mm_mul_pd(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#else
x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#endif
tmp3 = _mm_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
tmp4 = _mm_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
h1_real = _mm_loaddup_pd(&hh_dbl[0]);
h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
h1_real = _mm_xor_pd(h1_real, sign);
h1_imag = _mm_xor_pd(h1_imag, sign);
tmp1 = _mm_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)));
#else
x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)));
#endif
tmp2 = _mm_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)));
#else
x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)));
#endif
tmp3 = _mm_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#else
x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#endif
tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#else
x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#endif
h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
h1_real = _mm_xor_pd(h1_real, sign);
h1_imag = _mm_xor_pd(h1_imag, sign);
h2_real = _mm_xor_pd(h2_real, sign);
h2_imag = _mm_xor_pd(h2_imag, sign);
tmp2 = _mm_loadu_pd(s_dbl);
tmp1 = _mm_mul_pd(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)));
#else
tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)));
#endif
_mm_storeu_pd(s_dbl, tmp2);
h2_real = _mm_loaddup_pd(&s_dbl[0]);
h2_imag = _mm_loaddup_pd(&s_dbl[1]);
tmp1 = _mm_mul_pd(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)));
#else
y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)));
#endif
tmp2 = _mm_mul_pd(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)));
#else
y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)));
#endif
tmp3 = _mm_mul_pd(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#else
y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#endif
tmp4 = _mm_mul_pd(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
y4 = _mm_maddsub_pd(h1_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#else
y4 = _mm_addsub_pd( _mm_mul_pd(h1_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#endif
tmp1 = _mm_mul_pd(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#else
y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#endif
tmp2 = _mm_mul_pd(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#else
y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#endif
tmp3 = _mm_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
tmp4 = _mm_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
y4 = _mm_add_pd(y4, _mm_maddsub_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
q1 = _mm_load_pd(&q_dbl[0]);
q2 = _mm_load_pd(&q_dbl[2]);
q3 = _mm_load_pd(&q_dbl[4]);
q4 = _mm_load_pd(&q_dbl[6]);
q1 = _mm_add_pd(q1, y1);
q2 = _mm_add_pd(q2, y2);
q3 = _mm_add_pd(q3, y3);
q4 = _mm_add_pd(q4, y4);
_mm_store_pd(&q_dbl[0], q1);
_mm_store_pd(&q_dbl[2], q2);
_mm_store_pd(&q_dbl[4], q3);
_mm_store_pd(&q_dbl[6], q4);
h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]);
q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]);
q3 = _mm_load_pd(&q_dbl[(ldq*2)+4]);
q4 = _mm_load_pd(&q_dbl[(ldq*2)+6]);
q1 = _mm_add_pd(q1, x1);
q2 = _mm_add_pd(q2, x2);
q3 = _mm_add_pd(q3, x3);
q4 = _mm_add_pd(q4, x4);
tmp1 = _mm_mul_pd(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#else
q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#endif
tmp2 = _mm_mul_pd(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#else
q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#endif
tmp3 = _mm_mul_pd(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));