Commit d081b04a authored by Alexander Heinecke

added FMA4 support for one routine of the 2hv complex kernels

parent 0c4f9edb
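The patch wraps each complex multiply-accumulate of the 8-wide AVX 2hv kernel in an #ifdef __FMA4__ branch: where the plain AVX path first conjugates the broadcast imaginary part with an XOR against a sign mask and then combines _mm256_mul_pd with _mm256_addsub_pd, the FMA4 path uses _mm256_msubadd_pd (or _mm256_maddsub_pd for the non-conjugated updates) and drops the explicit conjugation. A minimal sketch of that equivalence, assuming the kernel's interleaved re/im packing of two complex doubles per __m256d; the names below are illustrative and not part of the patch:

#include <immintrin.h>

// Sketch: accumulate y += conj(h) * x, with h broadcast as (h_real, h_imag)
// and x holding two interleaved complex doubles [re0, im0, re1, im1].
static inline __m256d cmul_conj_acc(__m256d h_real, __m256d h_imag, __m256d x, __m256d y)
{
#ifdef __FMA4__
	// FMA4: msubadd already applies the add/subtract pattern that the
	// conjugation plus addsub produced in the plain AVX path, so no XOR is needed.
	__m256d tmp = _mm256_mul_pd(h_imag, x);
	return _mm256_add_pd(y, _mm256_msubadd_pd(h_real, x, _mm256_shuffle_pd(tmp, tmp, 0x5)));
#else
	// Plain AVX: negate the broadcast imaginary part (conjugate), then
	// multiply and combine with addsub.
	const __m256d sign = _mm256_set1_pd(-0.0);
	__m256d h_imag_c = _mm256_xor_pd(h_imag, sign);
	__m256d tmp = _mm256_mul_pd(h_imag_c, x);
	return _mm256_add_pd(y, _mm256_addsub_pd(_mm256_mul_pd(h_real, x), _mm256_shuffle_pd(tmp, tmp, 0x5)));
#endif
}

Both branches compute the same value; the FMA4 version saves the explicit conjugation and fuses the multiply with the alternating add/subtract, which is the whole point of the guarded duplication in the diff below.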
@@ -149,6 +149,16 @@ extern "C" void double_hh_trafo_complex_(std::complex<double>* q, std::complex<d
}
#ifdef __AVX__
#if 1
for (i = 0; i < nq-4; i+=8)
{
hh_trafo_complex_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
}
if (nq-i > 0)
{
hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
}
#else
for (i = 0; i < nq-4; i+=6)
{
hh_trafo_complex_kernel_6_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
@@ -161,6 +171,7 @@ extern "C" void double_hh_trafo_complex_(std::complex<double>* q, std::complex<d
{
hh_trafo_complex_kernel_2_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
}
#endif
#else
for (i = 0; i < nq; i+=4)
{
@@ -192,8 +203,10 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __FMA4__
// conjugate
h2_imag = _mm256_xor_pd(h2_imag, sign);
#endif
y1 = _mm256_load_pd(&q_dbl[0]);
y2 = _mm256_load_pd(&q_dbl[4]);
@@ -201,13 +214,29 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
y4 = _mm256_load_pd(&q_dbl[12]);
tmp1 = _mm256_mul_pd(h2_imag, x1);
#ifdef __FMA4__
y1 = _mm256_add_pd(y1, _mm256_msubadd_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
tmp2 = _mm256_mul_pd(h2_imag, x2);
#ifdef __FMA4__
y2 = _mm256_add_pd(y2, _mm256_msubadd_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
tmp3 = _mm256_mul_pd(h2_imag, x3);
#ifdef __FMA4__
y3 = _mm256_add_pd(y3, _mm256_msubadd_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
tmp4 = _mm256_mul_pd(h2_imag, x4);
#ifdef __FMA4__
y4 = _mm256_add_pd(y4, _mm256_msubadd_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
for (i = 2; i < nb; i++)
{
@@ -218,37 +247,75 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);
#ifndef __FMA4__
// conjugate
h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif
tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef __FMA4__
x1 = _mm256_add_pd(x1, _mm256_msubadd_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __FMA4__
x2 = _mm256_add_pd(x2, _mm256_msubadd_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __FMA4__
x3 = _mm256_add_pd(x3, _mm256_msubadd_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __FMA4__
x4 = _mm256_add_pd(x4, _mm256_msubadd_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __FMA4__
// conjugate
h2_imag = _mm256_xor_pd(h2_imag, sign);
#endif
tmp1 = _mm256_mul_pd(h2_imag, q1);
#ifdef __FMA4__
y1 = _mm256_add_pd(y1, _mm256_msubadd_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
tmp2 = _mm256_mul_pd(h2_imag, q2);
#ifdef __FMA4__
y2 = _mm256_add_pd(y2, _mm256_msubadd_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
tmp3 = _mm256_mul_pd(h2_imag, q3);
#ifdef __FMA4__
y3 = _mm256_add_pd(y3, _mm256_msubadd_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
tmp4 = _mm256_mul_pd(h2_imag, q4);
#ifdef __FMA4__
y4 = _mm256_add_pd(y4, _mm256_msubadd_pd(h2_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
}
h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);
#ifndef __FMA4__
// conjugate
h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif
q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]);
@@ -256,13 +323,29 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]);
tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef __FMA4__
x1 = _mm256_add_pd(x1, _mm256_msubadd_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __FMA4__
x2 = _mm256_add_pd(x2, _mm256_msubadd_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __FMA4__
x3 = _mm256_add_pd(x3, _mm256_msubadd_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __FMA4__
x4 = _mm256_add_pd(x4, _mm256_msubadd_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
@@ -270,13 +353,29 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
h1_imag = _mm256_xor_pd(h1_imag, sign);
tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __FMA4__
x1 = _mm256_maddsub_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __FMA4__
x2 = _mm256_maddsub_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif
tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __FMA4__
x3 = _mm256_maddsub_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#else
x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#endif
tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __FMA4__
x4 = _mm256_maddsub_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#else
x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#endif
h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
@@ -291,28 +390,64 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
__m128d tmp_s_128 = _mm_loadu_pd(s_dbl);
tmp2 = _mm256_broadcast_pd(&tmp_s_128);
tmp1 = _mm256_mul_pd(h2_imag, tmp2);
#ifdef __FMA4__
tmp2 = _mm256_maddsub_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
_mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2));
h2_real = _mm256_broadcast_sd(&s_dbl[0]);
h2_imag = _mm256_broadcast_sd(&s_dbl[1]);
tmp1 = _mm256_mul_pd(h1_imag, y1);
#ifdef __FMA4__
y1 = _mm256_maddsub_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
tmp2 = _mm256_mul_pd(h1_imag, y2);
#ifdef __FMA4__
y2 = _mm256_maddsub_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif
tmp3 = _mm256_mul_pd(h1_imag, y3);
#ifdef __FMA4__
y3 = _mm256_maddsub_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#else
y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#endif
tmp4 = _mm256_mul_pd(h1_imag, y4);
#ifdef __FMA4__
y4 = _mm256_maddsub_pd(h1_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#else
y4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#endif
tmp1 = _mm256_mul_pd(h2_imag, x1);
#ifdef __FMA4__
y1 = _mm256_add_pd(y1, _mm256_maddsub_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
tmp2 = _mm256_mul_pd(h2_imag, x2);
#ifdef __FMA4__
y2 = _mm256_add_pd(y2, _mm256_maddsub_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
tmp3 = _mm256_mul_pd(h2_imag, x3);
#ifdef __FMA4__
y3 = _mm256_add_pd(y3, _mm256_maddsub_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
tmp4 = _mm256_mul_pd(h2_imag, x4);
#ifdef __FMA4__
y4 = _mm256_add_pd(y4, _mm256_maddsub_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
q1 = _mm256_load_pd(&q_dbl[0]);
q2 = _mm256_load_pd(&q_dbl[4]);
@@ -343,13 +478,29 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
q4 = _mm256_add_pd(q4, x4);
tmp1 = _mm256_mul_pd(h2_imag, y1);
#ifdef __FMA4__
q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
tmp2 = _mm256_mul_pd(h2_imag, y2);
#ifdef __FMA4__
q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
tmp3 = _mm256_mul_pd(h2_imag, y3);
#ifdef __FMA4__
q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
tmp4 = _mm256_mul_pd(h2_imag, y4);
#ifdef __FMA4__
q4 = _mm256_add_pd(q4, _mm256_maddsub_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
_mm256_store_pd(&q_dbl[(ldq*2)+0], q1);
_mm256_store_pd(&q_dbl[(ldq*2)+4], q2);
@@ -367,25 +518,57 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);
tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __FMA4__
q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __FMA4__
q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __FMA4__
q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __FMA4__
q4 = _mm256_add_pd(q4, _mm256_maddsub_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);
tmp1 = _mm256_mul_pd(h2_imag, y1);
#ifdef __FMA4__
q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
tmp2 = _mm256_mul_pd(h2_imag, y2);
#ifdef __FMA4__
q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
tmp3 = _mm256_mul_pd(h2_imag, y3);
#ifdef __FMA4__
q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
tmp4 = _mm256_mul_pd(h2_imag, y4);
#ifdef __FMA4__
q4 = _mm256_add_pd(q4, _mm256_maddsub_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
_mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
_mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
@@ -401,13 +584,29 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]);
tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __FMA4__
q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __FMA4__
q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __FMA4__
q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __FMA4__
q4 = _mm256_add_pd(q4, _mm256_maddsub_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
_mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
_mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2);
...