Commit 0c4f9edb authored by Alexander Heinecke
Browse files

removed unnecessary comments in complex 2hv kernel file

parent 246a6d87
...@@ -25,12 +25,15 @@ ...@@ -25,12 +25,15 @@
//Forward declaration //Forward declaration
#ifdef __AVX__ #ifdef __AVX__
//extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); extern "C" __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#else #else
extern "C" __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); extern "C" __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#endif #endif
#if 0 #if 0
...@@ -167,7 +170,6 @@ extern "C" void double_hh_trafo_complex_(std::complex<double>* q, std::complex<d ...@@ -167,7 +170,6 @@ extern "C" void double_hh_trafo_complex_(std::complex<double>* q, std::complex<d
} }
#ifdef __AVX__ #ifdef __AVX__
#if 0
extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
...@@ -183,25 +185,16 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -183,25 +185,16 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
__m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
//x1 = q[ldq+0];
//x2 = q[ldq+1];
//x3 = q[ldq+2];
//x4 = q[ldq+3];
x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]);
x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]);
x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]); x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]);
x4 = _mm256_load_pd(&q_dbl[(2*ldq)+12]); x4 = _mm256_load_pd(&q_dbl[(2*ldq)+12]);
//h2 = conj(hh[ldh+1]);
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
// conjugate // conjugate
h2_imag = _mm256_xor_pd(h2_imag, sign); h2_imag = _mm256_xor_pd(h2_imag, sign);
//y1 = q[0] + (x1*h2);
//y2 = q[1] + (x2*h2);
//y3 = q[2] + (x3*h2);
//y4 = q[3] + (x4*h2);
y1 = _mm256_load_pd(&q_dbl[0]); y1 = _mm256_load_pd(&q_dbl[0]);
y2 = _mm256_load_pd(&q_dbl[4]); y2 = _mm256_load_pd(&q_dbl[4]);
y3 = _mm256_load_pd(&q_dbl[8]); y3 = _mm256_load_pd(&q_dbl[8]);
...@@ -218,17 +211,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -218,17 +211,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
for (i = 2; i < nb; i++) for (i = 2; i < nb; i++)
{ {
//h1 = conj(hh[i-1]);
//h2 = conj(hh[ldh+i]);
//x1 += (q[(i*ldq)+0] * h1);
//y1 += (q[(i*ldq)+0] * h2);
//x2 += (q[(i*ldq)+1] * h1);
//y2 += (q[(i*ldq)+1] * h2);
//x3 += (q[(i*ldq)+2] * h1);
//y3 += (q[(i*ldq)+2] * h2);
//x4 += (q[(i*ldq)+3] * h1);
//y4 += (q[(i*ldq)+3] * h2);
q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
...@@ -263,11 +245,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -263,11 +245,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
} }
//h1 = conj(hh[nb-1]);
//x1 += (q[(nb*ldq)+0] * h1);
//x2 += (q[(nb*ldq)+1] * h1);
//x3 += (q[(nb*ldq)+2] * h1);
//x4 += (q[(nb*ldq)+3] * h1);
h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);
// conjugate // conjugate
...@@ -287,17 +264,11 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -287,17 +264,11 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
tmp4 = _mm256_mul_pd(h1_imag, q4); tmp4 = _mm256_mul_pd(h1_imag, q4);
x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
//tau1 = hh[0];
//h1 = (-1.0)*tau1;
h1_real = _mm256_broadcast_sd(&hh_dbl[0]); h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
h1_real = _mm256_xor_pd(h1_real, sign); h1_real = _mm256_xor_pd(h1_real, sign);
h1_imag = _mm256_xor_pd(h1_imag, sign); h1_imag = _mm256_xor_pd(h1_imag, sign);
//x1 *= h1;
//x2 *= h1;
//x3 *= h1;
//x4 *= h1;
tmp1 = _mm256_mul_pd(h1_imag, x1); tmp1 = _mm256_mul_pd(h1_imag, x1);
x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
tmp2 = _mm256_mul_pd(h1_imag, x2); tmp2 = _mm256_mul_pd(h1_imag, x2);
...@@ -307,9 +278,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -307,9 +278,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
tmp4 = _mm256_mul_pd(h1_imag, x4); tmp4 = _mm256_mul_pd(h1_imag, x4);
x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
//tau2 = hh[ldh];
//h1 = (-1.0)*tau2;
//h2 = (-1.0)*tau2;
h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
...@@ -320,7 +288,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -320,7 +288,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
h2_real = _mm256_xor_pd(h2_real, sign); h2_real = _mm256_xor_pd(h2_real, sign);
h2_imag = _mm256_xor_pd(h2_imag, sign); h2_imag = _mm256_xor_pd(h2_imag, sign);
//h2 *= s;
__m128d tmp_s_128 = _mm_loadu_pd(s_dbl); __m128d tmp_s_128 = _mm_loadu_pd(s_dbl);
tmp2 = _mm256_broadcast_pd(&tmp_s_128); tmp2 = _mm256_broadcast_pd(&tmp_s_128);
tmp1 = _mm256_mul_pd(h2_imag, tmp2); tmp1 = _mm256_mul_pd(h2_imag, tmp2);
...@@ -329,10 +296,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -329,10 +296,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
h2_real = _mm256_broadcast_sd(&s_dbl[0]); h2_real = _mm256_broadcast_sd(&s_dbl[0]);
h2_imag = _mm256_broadcast_sd(&s_dbl[1]); h2_imag = _mm256_broadcast_sd(&s_dbl[1]);
//y1 = y1*h1 +x1*h2;
//y2 = y2*h1 +x2*h2;
//y3 = y3*h1 +x3*h2;
//y4 = y4*h1 +x4*h2;
tmp1 = _mm256_mul_pd(h1_imag, y1); tmp1 = _mm256_mul_pd(h1_imag, y1);
y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
tmp2 = _mm256_mul_pd(h1_imag, y2); tmp2 = _mm256_mul_pd(h1_imag, y2);
...@@ -342,7 +305,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -342,7 +305,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
tmp4 = _mm256_mul_pd(h1_imag, y4); tmp4 = _mm256_mul_pd(h1_imag, y4);
y4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); y4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
// y1+=x1*h2
tmp1 = _mm256_mul_pd(h2_imag, x1); tmp1 = _mm256_mul_pd(h2_imag, x1);
y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
tmp2 = _mm256_mul_pd(h2_imag, x2); tmp2 = _mm256_mul_pd(h2_imag, x2);
...@@ -357,10 +319,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -357,10 +319,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
q3 = _mm256_load_pd(&q_dbl[8]); q3 = _mm256_load_pd(&q_dbl[8]);
q4 = _mm256_load_pd(&q_dbl[12]); q4 = _mm256_load_pd(&q_dbl[12]);
//q[0] += y1;
//q[1] += y2;
//q[2] += y3;
//q[3] += y4;
q1 = _mm256_add_pd(q1, y1); q1 = _mm256_add_pd(q1, y1);
q2 = _mm256_add_pd(q2, y2); q2 = _mm256_add_pd(q2, y2);
q3 = _mm256_add_pd(q3, y3); q3 = _mm256_add_pd(q3, y3);
...@@ -371,7 +329,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -371,7 +329,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
_mm256_store_pd(&q_dbl[8], q3); _mm256_store_pd(&q_dbl[8], q3);
_mm256_store_pd(&q_dbl[12], q4); _mm256_store_pd(&q_dbl[12], q4);
//h2 = hh[ldh+1];
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
...@@ -380,10 +337,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -380,10 +337,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]); q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]);
q4 = _mm256_load_pd(&q_dbl[(ldq*2)+12]); q4 = _mm256_load_pd(&q_dbl[(ldq*2)+12]);
//q[ldq+0] += (x1 + (y1*h2));
//q[ldq+1] += (x2 + (y2*h2));
//q[ldq+2] += (x3 + (y3*h2));
//q[ldq+3] += (x4 + (y4*h2));
q1 = _mm256_add_pd(q1, x1); q1 = _mm256_add_pd(q1, x1);
q2 = _mm256_add_pd(q2, x2); q2 = _mm256_add_pd(q2, x2);
q3 = _mm256_add_pd(q3, x3); q3 = _mm256_add_pd(q3, x3);
...@@ -405,16 +358,11 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -405,16 +358,11 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
for (i = 2; i < nb; i++) for (i = 2; i < nb; i++)
{ {
//q[(i*ldq)+0] += ((x1*h1) + (y1*h2));
//q[(i*ldq)+1] += ((x2*h1) + (y2*h2));
//q[(i*ldq)+2] += ((x3*h1) + (y3*h2));
//q[(i*ldq)+3] += ((x4*h1) + (y4*h2));
q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);
//h1 = hh[i-1];
h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);
...@@ -427,7 +375,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -427,7 +375,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
tmp4 = _mm256_mul_pd(h1_imag, x4); tmp4 = _mm256_mul_pd(h1_imag, x4);
q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
//h2 = hh[ldh+i];
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);
...@@ -445,11 +392,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -445,11 +392,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
_mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3);
_mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4); _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4);
} }
//h1 = hh[nb-1];
//q[(nb*ldq)+0] += (x1*h1);
//q[(nb*ldq)+1] += (x2*h1);
//q[(nb*ldq)+2] += (x3*h1);
//q[(nb*ldq)+3] += (x4*h1);
h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);
...@@ -472,7 +414,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou ...@@ -472,7 +414,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
_mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3); _mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3);
_mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4); _mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4);
} }
#endif
extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{ {
...@@ -489,24 +430,15 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -489,24 +430,15 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
__m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
//x1 = q[ldq+0];
//x2 = q[ldq+1];
//x3 = q[ldq+2];
//x4 = q[ldq+3];
x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]);
x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]);
x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]); x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]);
//h2 = conj(hh[ldh+1]);
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
// conjugate // conjugate
h2_imag = _mm256_xor_pd(h2_imag, sign); h2_imag = _mm256_xor_pd(h2_imag, sign);
//y1 = q[0] + (x1*h2);
//y2 = q[1] + (x2*h2);
//y3 = q[2] + (x3*h2);
//y4 = q[3] + (x4*h2);
y1 = _mm256_load_pd(&q_dbl[0]); y1 = _mm256_load_pd(&q_dbl[0]);
y2 = _mm256_load_pd(&q_dbl[4]); y2 = _mm256_load_pd(&q_dbl[4]);
y3 = _mm256_load_pd(&q_dbl[8]); y3 = _mm256_load_pd(&q_dbl[8]);
...@@ -520,17 +452,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -520,17 +452,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
for (i = 2; i < nb; i++) for (i = 2; i < nb; i++)
{ {
//h1 = conj(hh[i-1]);
//h2 = conj(hh[ldh+i]);
//x1 += (q[(i*ldq)+0] * h1);
//y1 += (q[(i*ldq)+0] * h2);
//x2 += (q[(i*ldq)+1] * h1);
//y2 += (q[(i*ldq)+1] * h2);
//x3 += (q[(i*ldq)+2] * h1);
//y3 += (q[(i*ldq)+2] * h2);
//x4 += (q[(i*ldq)+3] * h1);
//y4 += (q[(i*ldq)+3] * h2);
q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
...@@ -560,11 +481,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -560,11 +481,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
} }
//h1 = conj(hh[nb-1]);
//x1 += (q[(nb*ldq)+0] * h1);
//x2 += (q[(nb*ldq)+1] * h1);
//x3 += (q[(nb*ldq)+2] * h1);
//x4 += (q[(nb*ldq)+3] * h1);
h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);
// conjugate // conjugate
...@@ -581,17 +497,11 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -581,17 +497,11 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
tmp3 = _mm256_mul_pd(h1_imag, q3); tmp3 = _mm256_mul_pd(h1_imag, q3);
x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
//tau1 = hh[0];
//h1 = (-1.0)*tau1;
h1_real = _mm256_broadcast_sd(&hh_dbl[0]); h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
h1_real = _mm256_xor_pd(h1_real, sign); h1_real = _mm256_xor_pd(h1_real, sign);
h1_imag = _mm256_xor_pd(h1_imag, sign); h1_imag = _mm256_xor_pd(h1_imag, sign);
//x1 *= h1;
//x2 *= h1;
//x3 *= h1;
//x4 *= h1;
tmp1 = _mm256_mul_pd(h1_imag, x1); tmp1 = _mm256_mul_pd(h1_imag, x1);
x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
tmp2 = _mm256_mul_pd(h1_imag, x2); tmp2 = _mm256_mul_pd(h1_imag, x2);
...@@ -599,9 +509,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -599,9 +509,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
tmp3 = _mm256_mul_pd(h1_imag, x3); tmp3 = _mm256_mul_pd(h1_imag, x3);
x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
//tau2 = hh[ldh];
//h1 = (-1.0)*tau2;
//h2 = (-1.0)*tau2;
h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
...@@ -612,7 +519,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -612,7 +519,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
h2_real = _mm256_xor_pd(h2_real, sign); h2_real = _mm256_xor_pd(h2_real, sign);
h2_imag = _mm256_xor_pd(h2_imag, sign); h2_imag = _mm256_xor_pd(h2_imag, sign);
//h2 *= s;
__m128d tmp_s_128 = _mm_loadu_pd(s_dbl); __m128d tmp_s_128 = _mm_loadu_pd(s_dbl);
tmp2 = _mm256_broadcast_pd(&tmp_s_128); tmp2 = _mm256_broadcast_pd(&tmp_s_128);
tmp1 = _mm256_mul_pd(h2_imag, tmp2); tmp1 = _mm256_mul_pd(h2_imag, tmp2);
...@@ -621,10 +527,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -621,10 +527,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
h2_real = _mm256_broadcast_sd(&s_dbl[0]); h2_real = _mm256_broadcast_sd(&s_dbl[0]);
h2_imag = _mm256_broadcast_sd(&s_dbl[1]); h2_imag = _mm256_broadcast_sd(&s_dbl[1]);
//y1 = y1*h1 +x1*h2;
//y2 = y2*h1 +x2*h2;
//y3 = y3*h1 +x3*h2;
//y4 = y4*h1 +x4*h2;
tmp1 = _mm256_mul_pd(h1_imag, y1); tmp1 = _mm256_mul_pd(h1_imag, y1);
y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
tmp2 = _mm256_mul_pd(h1_imag, y2); tmp2 = _mm256_mul_pd(h1_imag, y2);
...@@ -632,7 +534,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -632,7 +534,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
tmp3 = _mm256_mul_pd(h1_imag, y3); tmp3 = _mm256_mul_pd(h1_imag, y3);
y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
// y1+=x1*h2
tmp1 = _mm256_mul_pd(h2_imag, x1); tmp1 = _mm256_mul_pd(h2_imag, x1);
y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
tmp2 = _mm256_mul_pd(h2_imag, x2); tmp2 = _mm256_mul_pd(h2_imag, x2);
...@@ -644,10 +545,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -644,10 +545,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
q2 = _mm256_load_pd(&q_dbl[4]); q2 = _mm256_load_pd(&q_dbl[4]);
q3 = _mm256_load_pd(&q_dbl[8]); q3 = _mm256_load_pd(&q_dbl[8]);
//q[0] += y1;
//q[1] += y2;
//q[2] += y3;
//q[3] += y4;
q1 = _mm256_add_pd(q1, y1); q1 = _mm256_add_pd(q1, y1);
q2 = _mm256_add_pd(q2, y2); q2 = _mm256_add_pd(q2, y2);
q3 = _mm256_add_pd(q3, y3); q3 = _mm256_add_pd(q3, y3);
...@@ -656,7 +553,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -656,7 +553,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
_mm256_store_pd(&q_dbl[4], q2); _mm256_store_pd(&q_dbl[4], q2);
_mm256_store_pd(&q_dbl[8], q3); _mm256_store_pd(&q_dbl[8], q3);
//h2 = hh[ldh+1];
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
...@@ -664,10 +560,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -664,10 +560,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]); q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]);
q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]); q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]);
//q[ldq+0] += (x1 + (y1*h2));
//q[ldq+1] += (x2 + (y2*h2));
//q[ldq+2] += (x3 + (y3*h2));
//q[ldq+3] += (x4 + (y4*h2));
q1 = _mm256_add_pd(q1, x1); q1 = _mm256_add_pd(q1, x1);
q2 = _mm256_add_pd(q2, x2); q2 = _mm256_add_pd(q2, x2);
q3 = _mm256_add_pd(q3, x3); q3 = _mm256_add_pd(q3, x3);
...@@ -685,15 +577,10 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -685,15 +577,10 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
for (i = 2; i < nb; i++) for (i = 2; i < nb; i++)
{ {
//q[(i*ldq)+0] += ((x1*h1) + (y1*h2));
//q[(i*ldq)+1] += ((x2*h1) + (y2*h2));
//q[(i*ldq)+2] += ((x3*h1) + (y3*h2));
//q[(i*ldq)+3] += ((x4*h1) + (y4*h2));
q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
//h1 = hh[i-1];
h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);
...@@ -704,7 +591,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -704,7 +591,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
tmp3 = _mm256_mul_pd(h1_imag, x3); tmp3 = _mm256_mul_pd(h1_imag, x3);
q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
//h2 = hh[ldh+i];
h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);
...@@ -719,11 +605,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou ...@@ -719,11 +605,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
_mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
_mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3);
} }
//h1 = hh[nb-1];
//q[(nb*ldq)+0] += (x1*h1);
//q[(nb*ldq)+1] += (x2*h1);
//q[(nb*ldq)+2] += (x3*h1);