Commit ae81ebca authored by Alexander Heinecke

Finalized the 1hv kernel for complex numbers, which is slightly faster than the 2hv kernel -> now the default

parent 953a4ed6
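
For orientation, a minimal scalar sketch of what one 1hv (single Householder vector) update does to a column is given below; the 2hv kernel fuses two such reflectors so that each column block is loaded from memory only once. The convention assumed here (tau stored in hh[0], implicit leading 1 in the vector) is read off the commented pseudo-code in the kernels further down; this is an illustration only, not the optimized code in the diff.

    #include <complex>

    // Minimal scalar sketch (illustration only): apply q <- (I - tau*v*v^H) * q
    // to one column of length nb, assuming hh[0] carries tau and the leading
    // entry of v is an implicit 1.
    static void single_hh_reference(std::complex<double>* q,
                                    const std::complex<double>* hh, int nb)
    {
        // dot = v^H * q, with v = (1, hh[1], ..., hh[nb-1])
        std::complex<double> dot = q[0];
        for (int i = 1; i < nb; i++)
            dot += std::conj(hh[i]) * q[i];

        // x = -tau * (v^H * q), then rank-1 update q += v * x
        std::complex<double> x = -hh[0] * dot;
        q[0] += x;
        for (int i = 1; i < nb; i++)
            q[i] += hh[i] * x;
    }
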
@@ -3198,29 +3198,33 @@ contains
subroutine compute_hh_trafo(off, ncols, istripe)
integer off, ncols, istripe, j, nl, jj
complex*16 w(nbw,2)
real*8 ttt
ttt = mpi_wtime()
nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
!FORTRAN CODE / X86 INTRINSIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
call double_hh_trafo_complex(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
enddo
if(j==1) call single_hh_trafo_complex(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
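! (note: on exit from the loop above, j == 1 exactly when ncols is odd, so the
!  leftover Householder vector is applied with the single-vector kernel)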
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Currently (on Sandy Bridge), single is faster than double
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! integer off, ncols, istripe, j, nl, jj
! real*8 ttt
! complex*16 w(nbw,2)
!
! ttt = mpi_wtime()
! nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
! do j = ncols, 1, -1
! call single_hh_trafo_complex(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
! do j = ncols, 2, -2
! w(:,1) = bcast_buffer(1:nbw,j+off)
! w(:,2) = bcast_buffer(1:nbw,j+off-1)
! call double_hh_trafo_complex(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
! enddo
! if(j==1) call single_hh_trafo_complex(a(1,1+off+a_off,istripe),bcast_buffer(1,off+1), nbw, nl, stripe_width)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Currently (on Sandy Bridge), single is faster than double
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
integer off, ncols, istripe, j, nl, jj
real*8 ttt
ttt = mpi_wtime()
nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
do j = ncols, 1, -1
call single_hh_trafo_complex(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
enddo
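! (flop accounting sketch: per applied Householder vector, the dot products and
!  the rank-1 update each take nl*nbw complex multiply-adds of 8 real flops,
!  i.e. 2*8 = 4*4 real flops per entry, giving 4*4*nl*ncols*nbw in total)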
kernel_flops = kernel_flops + 4*4*int(nl,8)*int(ncols,8)*int(nbw,8)
kernel_time = kernel_time + mpi_wtime()-ttt
......
@@ -25,8 +25,10 @@
//Forward declaration
#ifdef __AVX__
extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
//extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#else
extern "C" __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#endif
@@ -144,14 +146,18 @@ extern "C" void double_hh_trafo_complex_(std::complex<double>* q, std::complex<d
}
#ifdef __AVX__
for (i = 0; i < nq-4; i+=8)
for (i = 0; i < nq-4; i+=6)
{
hh_trafo_complex_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_complex_kernel_6_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
}
if (i < nq)
if (nq-i > 2)
{
hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
}
else if (nq-i > 0)
{
hh_trafo_complex_kernel_2_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
}
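// Remainder handling (assuming nq is even): the loop above processes blocks of
// 6 complex values while at least 6 remain (i < nq-4), so at most 4 values are
// left over; a remainder of 4 goes to the 4-wide kernel and a remainder of 2
// to the 2-wide kernel.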
#else
for (i = 0; i < nq; i+=4)
{
@@ -161,6 +167,7 @@ extern "C" void double_hh_trafo_complex_(std::complex<double>* q, std::complex<d
}
#ifdef __AVX__
#if 0
extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
double* q_dbl = (double*)q;
@@ -465,6 +472,276 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
_mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3);
_mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4);
}
#endif
extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
double* s_dbl = (double*)(&s);
__m256d x1, x2, x3;
__m256d y1, y2, y3;
__m256d q1, q2, q3;
__m256d h1_real, h1_imag, h2_real, h2_imag;
__m256d tmp1, tmp2, tmp3;
int i=0;
__m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
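// The recurring pattern below is a packed complex multiply: with x holding
// (re0, im0, re1, im1) and h_real/h_imag broadcast from one coefficient,
//   tmp = h_imag * x;  shuffle(tmp, tmp, 0x5) swaps re/im within each pair;
//   addsub(h_real*x, shuffled tmp) = (hr*re - hi*im, hr*im + hi*re, ...) = h*x.
// Conjugating a coefficient only requires XORing its imaginary part with the
// sign mask above.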
//x1 = q[ldq+0];
//x2 = q[ldq+1];
//x3 = q[ldq+2];
//x4 = q[ldq+3];
x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]);
x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]);
x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]);
//h2 = conj(hh[ldh+1]);
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
// conjugate
h2_imag = _mm256_xor_pd(h2_imag, sign);
//y1 = q[0] + (x1*h2);
//y2 = q[1] + (x2*h2);
//y3 = q[2] + (x3*h2);
//y4 = q[3] + (x4*h2);
y1 = _mm256_load_pd(&q_dbl[0]);
y2 = _mm256_load_pd(&q_dbl[4]);
y3 = _mm256_load_pd(&q_dbl[8]);
tmp1 = _mm256_mul_pd(h2_imag, x1);
y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
tmp2 = _mm256_mul_pd(h2_imag, x2);
y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
tmp3 = _mm256_mul_pd(h2_imag, x3);
y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
for (i = 2; i < nb; i++)
{
//h1 = conj(hh[i-1]);
//h2 = conj(hh[ldh+i]);
//x1 += (q[(i*ldq)+0] * h1);
//y1 += (q[(i*ldq)+0] * h2);
//x2 += (q[(i*ldq)+1] * h1);
//y2 += (q[(i*ldq)+1] * h2);
//x3 += (q[(i*ldq)+2] * h1);
//y3 += (q[(i*ldq)+2] * h2);
//x4 += (q[(i*ldq)+3] * h1);
//y4 += (q[(i*ldq)+3] * h2);
q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);
// conjugate
h1_imag = _mm256_xor_pd(h1_imag, sign);
tmp1 = _mm256_mul_pd(h1_imag, q1);
x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
tmp2 = _mm256_mul_pd(h1_imag, q2);
x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
tmp3 = _mm256_mul_pd(h1_imag, q3);
x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);
// conjugate
h2_imag = _mm256_xor_pd(h2_imag, sign);
tmp1 = _mm256_mul_pd(h2_imag, q1);
y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
tmp2 = _mm256_mul_pd(h2_imag, q2);
y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
tmp3 = _mm256_mul_pd(h2_imag, q3);
y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
}
//h1 = conj(hh[nb-1]);
//x1 += (q[(nb*ldq)+0] * h1);
//x2 += (q[(nb*ldq)+1] * h1);
//x3 += (q[(nb*ldq)+2] * h1);
//x4 += (q[(nb*ldq)+3] * h1);
h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);
// conjugate
h1_imag = _mm256_xor_pd(h1_imag, sign);
q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]);
q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]);
tmp1 = _mm256_mul_pd(h1_imag, q1);
x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
tmp2 = _mm256_mul_pd(h1_imag, q2);
x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
tmp3 = _mm256_mul_pd(h1_imag, q3);
x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
//tau1 = hh[0];
//h1 = (-1.0)*tau1;
h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
h1_real = _mm256_xor_pd(h1_real, sign);
h1_imag = _mm256_xor_pd(h1_imag, sign);
//x1 *= h1;
//x2 *= h1;
//x3 *= h1;
//x4 *= h1;
tmp1 = _mm256_mul_pd(h1_imag, x1);
x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
tmp2 = _mm256_mul_pd(h1_imag, x2);
x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
tmp3 = _mm256_mul_pd(h1_imag, x3);
x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
//tau2 = hh[ldh];
//h1 = (-1.0)*tau2;
//h2 = (-1.0)*tau2;
h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
h1_real = _mm256_xor_pd(h1_real, sign);
h1_imag = _mm256_xor_pd(h1_imag, sign);
h2_real = _mm256_xor_pd(h2_real, sign);
h2_imag = _mm256_xor_pd(h2_imag, sign);
//h2 *= s;
__m128d tmp_s_128 = _mm_loadu_pd(s_dbl);
tmp2 = _mm256_broadcast_pd(&tmp_s_128);
tmp1 = _mm256_mul_pd(h2_imag, tmp2);
tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
_mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2));
h2_real = _mm256_broadcast_sd(&s_dbl[0]);
h2_imag = _mm256_broadcast_sd(&s_dbl[1]);
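// (reading of the lines above: the product (-tau2)*s is formed once in the low
// 128-bit lane, written back to s_dbl, and its real/imaginary parts are then
// re-broadcast, so the full-width h2 now holds the scaled coefficient used in
// the y update below)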
//y1 = y1*h1 +x1*h2;
//y2 = y2*h1 +x2*h2;
//y3 = y3*h1 +x3*h2;
//y4 = y4*h1 +x4*h2;
tmp1 = _mm256_mul_pd(h1_imag, y1);
y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
tmp2 = _mm256_mul_pd(h1_imag, y2);
y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
tmp3 = _mm256_mul_pd(h1_imag, y3);
y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
// y1+=x1*h2
tmp1 = _mm256_mul_pd(h2_imag, x1);
y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
tmp2 = _mm256_mul_pd(h2_imag, x2);
y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
tmp3 = _mm256_mul_pd(h2_imag, x3);
y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
q1 = _mm256_load_pd(&q_dbl[0]);
q2 = _mm256_load_pd(&q_dbl[4]);
q3 = _mm256_load_pd(&q_dbl[8]);
//q[0] += y1;
//q[1] += y2;
//q[2] += y3;
//q[3] += y4;
q1 = _mm256_add_pd(q1, y1);
q2 = _mm256_add_pd(q2, y2);
q3 = _mm256_add_pd(q3, y3);
_mm256_store_pd(&q_dbl[0], q1);
_mm256_store_pd(&q_dbl[4], q2);
_mm256_store_pd(&q_dbl[8], q3);
//h2 = hh[ldh+1];
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]);
q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]);
q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]);
//q[ldq+0] += (x1 + (y1*h2));
//q[ldq+1] += (x2 + (y2*h2));
//q[ldq+2] += (x3 + (y3*h2));
//q[ldq+3] += (x4 + (y4*h2));
q1 = _mm256_add_pd(q1, x1);
q2 = _mm256_add_pd(q2, x2);
q3 = _mm256_add_pd(q3, x3);
tmp1 = _mm256_mul_pd(h2_imag, y1);
q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
tmp2 = _mm256_mul_pd(h2_imag, y2);
q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
tmp3 = _mm256_mul_pd(h2_imag, y3);
q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
_mm256_store_pd(&q_dbl[(ldq*2)+0], q1);
_mm256_store_pd(&q_dbl[(ldq*2)+4], q2);
_mm256_store_pd(&q_dbl[(ldq*2)+8], q3);
for (i = 2; i < nb; i++)
{
//q[(i*ldq)+0] += ((x1*h1) + (y1*h2));
//q[(i*ldq)+1] += ((x2*h1) + (y2*h2));
//q[(i*ldq)+2] += ((x3*h1) + (y3*h2));
//q[(i*ldq)+3] += ((x4*h1) + (y4*h2));
q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
//h1 = hh[i-1];
h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);
tmp1 = _mm256_mul_pd(h1_imag, x1);
q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
tmp2 = _mm256_mul_pd(h1_imag, x2);
q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
tmp3 = _mm256_mul_pd(h1_imag, x3);
q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
//h2 = hh[ldh+i];
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);
tmp1 = _mm256_mul_pd(h2_imag, y1);
q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
tmp2 = _mm256_mul_pd(h2_imag, y2);
q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
tmp3 = _mm256_mul_pd(h2_imag, y3);
q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
_mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
_mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
_mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3);
}
//h1 = hh[nb-1];
//q[(nb*ldq)+0] += (x1*h1);
//q[(nb*ldq)+1] += (x2*h1);
//q[(nb*ldq)+2] += (x3*h1);
//q[(nb*ldq)+3] += (x4*h1);
h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);
q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]);
q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]);
tmp1 = _mm256_mul_pd(h1_imag, x1);
q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
tmp2 = _mm256_mul_pd(h1_imag, x2);
q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
tmp3 = _mm256_mul_pd(h1_imag, x3);
q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
_mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
_mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2);
_mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3);
}
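// The 4-wide kernel (partly elided in this diff) and the 2-wide kernel below
// follow the same scheme with two and one __m256d register(s) per step,
// respectively; only the register count differs.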
extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
@@ -698,6 +975,147 @@ extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<dou
_mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
_mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2);
}
extern "C" __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
double* s_dbl = (double*)(&s);
__m256d x1;
__m256d y1;
__m256d q1;
__m256d h1_real, h1_imag, h2_real, h2_imag;
__m256d tmp1;
int i=0;
__m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]);
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
// conjugate
h2_imag = _mm256_xor_pd(h2_imag, sign);
y1 = _mm256_load_pd(&q_dbl[0]);
tmp1 = _mm256_mul_pd(h2_imag, x1);
y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
for (i = 2; i < nb; i++)
{
q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);
// conjugate
h1_imag = _mm256_xor_pd(h1_imag, sign);
tmp1 = _mm256_mul_pd(h1_imag, q1);
x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);
// conjugate
h2_imag = _mm256_xor_pd(h2_imag, sign);
tmp1 = _mm256_mul_pd(h2_imag, q1);
y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
}
h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);
// conjugate
h1_imag = _mm256_xor_pd(h1_imag, sign);
q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
tmp1 = _mm256_mul_pd(h1_imag, q1);
x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
h1_real = _mm256_xor_pd(h1_real, sign);
h1_imag = _mm256_xor_pd(h1_imag, sign);
tmp1 = _mm256_mul_pd(h1_imag, x1);
x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
h1_real = _mm256_xor_pd(h1_real, sign);
h1_imag = _mm256_xor_pd(h1_imag, sign);
h2_real = _mm256_xor_pd(h2_real, sign);
h2_imag = _mm256_xor_pd(h2_imag, sign);
//h2 *= s;
__m128d tmp_s_128 = _mm_loadu_pd(s_dbl);
__m256d tmp2 = _mm256_broadcast_pd(&tmp_s_128);
tmp1 = _mm256_mul_pd(h2_imag, tmp2);
tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
_mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2));
h2_real = _mm256_broadcast_sd(&s_dbl[0]);
h2_imag = _mm256_broadcast_sd(&s_dbl[1]);
tmp1 = _mm256_mul_pd(h1_imag, y1);
y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
tmp1 = _mm256_mul_pd(h2_imag, x1);
y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
q1 = _mm256_load_pd(&q_dbl[0]);
q1 = _mm256_add_pd(q1, y1);
_mm256_store_pd(&q_dbl[0], q1);
//h2 = hh[ldh+1];
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]);
q1 = _mm256_add_pd(q1, x1);
tmp1 = _mm256_mul_pd(h2_imag, y1);
q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
_mm256_store_pd(&q_dbl[(ldq*2)+0], q1);
for (i = 2; i < nb; i++)
{
q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
//h1 = hh[i-1];
h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);
tmp1 = _mm256_mul_pd(h1_imag, x1);
q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
//h2 = hh[ldh+i];
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);
tmp1 = _mm256_mul_pd(h2_imag, y1);
q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
_mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
}
h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);
q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
tmp1 = _mm256_mul_pd(h1_imag, x1);
q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
_mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
}
#else
extern "C" __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
......
@@ -4,9 +4,9 @@
# Settings for Intel Fortran (Linux), Intel Composer XE 2011 (ifort 12.1) with AVX for Sandy Bridge:
#
X86=1
F90=mpiifort -O3 -traceback -fpe0
F90OPT=$(F90) -mavx
#CC=mpiicc -O3
F90=mpiifort -O3 -traceback -fpe0 -g -mavx
F90OPT=$(F90)
#CC=icc -O3
#CCOPT=$(CC) -mavx
CC=gcc -O3
CCOPT=$(CC) -mavx -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize
@@ -17,7 +17,7 @@ LIBS = -mkl=sequential -L$(MKL_HOME) -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_l
# Settings for Intel Fortran (Linux), Intel Composer XE 2011 (ifort 12.1) with SSE3:
#
#X86=1
#F90=mpiifort -O3 -traceback -fpe0 -g
#F90=mpiifort -O3 -traceback -fpe0 -g -msse3
#F90OPT=$(F90) -msse3
#CC=gcc -O3
#CCOPT=$(CC) -msse3 -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize
@@ -30,7 +30,7 @@ LIBS = -mkl=sequential -L$(MKL_HOME) -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_l
# Settings for Intel Fortran (Linux), Intel Composer XE 2011 (ifort 12.1) and GCC 4.6 with FMA4 for AMD Bulldozer:
#
#X86=1
#F90=mpiifort -O3 -traceback -fpe0 -g
#F90=mpiifort -O3 -traceback -fpe0 -g -msse3
#CC=gcc -O3
#F90OPT=$(F90) -msse3
#CCOPT=$(CC) -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize -mfma4 -mxop -march=bdver1 -D__USE_AVX128__
......