Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
0c4f9edb
Commit
0c4f9edb
authored
Jan 04, 2013
by
Alexander Heinecke
Browse files
removed unnecessary comments in complex 2hv kernel file
parent
246a6d87
Changes
1
Show whitespace changes
Inline
Side-by-side
ELPA_2011.12.Intrinsics/src/elpa2_kernels/elpa2_tum_kernels_complex_sse-avx_2hv.cpp
View file @
0c4f9edb
...
...
@@ -25,12 +25,15 @@
//Forward declaration
#ifdef __AVX__
//
extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
// Prototypes for the AVX 2hv kernels (variants 8, 6, 4 and 2).
// All four share one parameter list: q, hh, nb, ldq, ldh and s.
extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(
    std::complex<double>* q, std::complex<double>* hh,
    int nb, int ldq, int ldh, std::complex<double> s);

extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(
    std::complex<double>* q, std::complex<double>* hh,
    int nb, int ldq, int ldh, std::complex<double> s);

extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(
    std::complex<double>* q, std::complex<double>* hh,
    int nb, int ldq, int ldh, std::complex<double> s);

extern "C" __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(
    std::complex<double>* q, std::complex<double>* hh,
    int nb, int ldq, int ldh, std::complex<double> s);
#else
// Prototypes for the SSE 2hv kernels (variants 4, 3, 2 and 1).
// All four share one parameter list: q, hh, nb, ldq, ldh and s.
extern "C" __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(
    std::complex<double>* q, std::complex<double>* hh,
    int nb, int ldq, int ldh, std::complex<double> s);

extern "C" __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(
    std::complex<double>* q, std::complex<double>* hh,
    int nb, int ldq, int ldh, std::complex<double> s);

extern "C" __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(
    std::complex<double>* q, std::complex<double>* hh,
    int nb, int ldq, int ldh, std::complex<double> s);

extern "C" __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(
    std::complex<double>* q, std::complex<double>* hh,
    int nb, int ldq, int ldh, std::complex<double> s);
#endif
#if 0
...
...
@@ -167,7 +170,6 @@ extern "C" void double_hh_trafo_complex_(std::complex<double>* q, std::complex<d
}
#ifdef __AVX__
#if 0
extern
"C"
__forceinline
void
hh_trafo_complex_kernel_8_AVX_2hv
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
std
::
complex
<
double
>
s
)
{
double
*
q_dbl
=
(
double
*
)
q
;
...
...
@@ -183,25 +185,16 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
__m256d
sign
=
(
__m256d
)
_mm256_set_epi64x
(
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
);
//x1 = q[ldq+0];
//x2 = q[ldq+1];
//x3 = q[ldq+2];
//x4 = q[ldq+3];
x1
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
ldq
)
+
0
]);
x2
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
ldq
)
+
4
]);
x3
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
ldq
)
+
8
]);
x4
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
ldq
)
+
12
]);
//h2 = conj(hh[ldh+1]);
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
+
1
)
*
2
]);
h2_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
// conjugate
h2_imag
=
_mm256_xor_pd
(
h2_imag
,
sign
);
//y1 = q[0] + (x1*h2);
//y2 = q[1] + (x2*h2);
//y3 = q[2] + (x3*h2);
//y4 = q[3] + (x4*h2);
y1
=
_mm256_load_pd
(
&
q_dbl
[
0
]);
y2
=
_mm256_load_pd
(
&
q_dbl
[
4
]);
y3
=
_mm256_load_pd
(
&
q_dbl
[
8
]);
...
...
@@ -218,17 +211,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
for
(
i
=
2
;
i
<
nb
;
i
++
)
{
//h1 = conj(hh[i-1]);
//h2 = conj(hh[ldh+i]);
//x1 += (q[(i*ldq)+0] * h1);
//y1 += (q[(i*ldq)+0] * h2);
//x2 += (q[(i*ldq)+1] * h1);
//y2 += (q[(i*ldq)+1] * h2);
//x3 += (q[(i*ldq)+2] * h1);
//y3 += (q[(i*ldq)+2] * h2);
//x4 += (q[(i*ldq)+3] * h1);
//y4 += (q[(i*ldq)+3] * h2);
q1
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
0
]);
q2
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
4
]);
q3
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
8
]);
...
...
@@ -263,11 +245,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
y4
=
_mm256_add_pd
(
y4
,
_mm256_addsub_pd
(
_mm256_mul_pd
(
h2_real
,
q4
),
_mm256_shuffle_pd
(
tmp4
,
tmp4
,
0x5
)));
}
//h1 = conj(hh[nb-1]);
//x1 += (q[(nb*ldq)+0] * h1);
//x2 += (q[(nb*ldq)+1] * h1);
//x3 += (q[(nb*ldq)+2] * h1);
//x4 += (q[(nb*ldq)+3] * h1);
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
nb
-
1
)
*
2
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
nb
-
1
)
*
2
)
+
1
]);
// conjugate
...
...
@@ -287,17 +264,11 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
tmp4
=
_mm256_mul_pd
(
h1_imag
,
q4
);
x4
=
_mm256_add_pd
(
x4
,
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
q4
),
_mm256_shuffle_pd
(
tmp4
,
tmp4
,
0x5
)));
//tau1 = hh[0];
//h1 = (-1.0)*tau1;
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[
0
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[
1
]);
h1_real
=
_mm256_xor_pd
(
h1_real
,
sign
);
h1_imag
=
_mm256_xor_pd
(
h1_imag
,
sign
);
//x1 *= h1;
//x2 *= h1;
//x3 *= h1;
//x4 *= h1;
tmp1
=
_mm256_mul_pd
(
h1_imag
,
x1
);
x1
=
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
x1
),
_mm256_shuffle_pd
(
tmp1
,
tmp1
,
0x5
));
tmp2
=
_mm256_mul_pd
(
h1_imag
,
x2
);
...
...
@@ -307,9 +278,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
tmp4
=
_mm256_mul_pd
(
h1_imag
,
x4
);
x4
=
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
x4
),
_mm256_shuffle_pd
(
tmp4
,
tmp4
,
0x5
));
//tau2 = hh[ldh];
//h1 = (-1.0)*tau2;
//h2 = (-1.0)*tau2;
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[
ldh
*
2
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
*
2
)
+
1
]);
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[
ldh
*
2
]);
...
...
@@ -320,7 +288,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
h2_real
=
_mm256_xor_pd
(
h2_real
,
sign
);
h2_imag
=
_mm256_xor_pd
(
h2_imag
,
sign
);
//h2 *= s;
__m128d
tmp_s_128
=
_mm_loadu_pd
(
s_dbl
);
tmp2
=
_mm256_broadcast_pd
(
&
tmp_s_128
);
tmp1
=
_mm256_mul_pd
(
h2_imag
,
tmp2
);
...
...
@@ -329,10 +296,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
h2_real
=
_mm256_broadcast_sd
(
&
s_dbl
[
0
]);
h2_imag
=
_mm256_broadcast_sd
(
&
s_dbl
[
1
]);
//y1 = y1*h1 +x1*h2;
//y2 = y2*h1 +x2*h2;
//y3 = y3*h1 +x3*h2;
//y4 = y4*h1 +x4*h2;
tmp1
=
_mm256_mul_pd
(
h1_imag
,
y1
);
y1
=
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
y1
),
_mm256_shuffle_pd
(
tmp1
,
tmp1
,
0x5
));
tmp2
=
_mm256_mul_pd
(
h1_imag
,
y2
);
...
...
@@ -342,7 +305,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
tmp4
=
_mm256_mul_pd
(
h1_imag
,
y4
);
y4
=
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
y4
),
_mm256_shuffle_pd
(
tmp4
,
tmp4
,
0x5
));
// y1+=x1*h2
tmp1
=
_mm256_mul_pd
(
h2_imag
,
x1
);
y1
=
_mm256_add_pd
(
y1
,
_mm256_addsub_pd
(
_mm256_mul_pd
(
h2_real
,
x1
),
_mm256_shuffle_pd
(
tmp1
,
tmp1
,
0x5
)));
tmp2
=
_mm256_mul_pd
(
h2_imag
,
x2
);
...
...
@@ -357,10 +319,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
q3
=
_mm256_load_pd
(
&
q_dbl
[
8
]);
q4
=
_mm256_load_pd
(
&
q_dbl
[
12
]);
//q[0] += y1;
//q[1] += y2;
//q[2] += y3;
//q[3] += y4;
q1
=
_mm256_add_pd
(
q1
,
y1
);
q2
=
_mm256_add_pd
(
q2
,
y2
);
q3
=
_mm256_add_pd
(
q3
,
y3
);
...
...
@@ -371,7 +329,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
_mm256_store_pd
(
&
q_dbl
[
8
],
q3
);
_mm256_store_pd
(
&
q_dbl
[
12
],
q4
);
//h2 = hh[ldh+1];
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
+
1
)
*
2
]);
h2_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
...
...
@@ -380,10 +337,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
q3
=
_mm256_load_pd
(
&
q_dbl
[(
ldq
*
2
)
+
8
]);
q4
=
_mm256_load_pd
(
&
q_dbl
[(
ldq
*
2
)
+
12
]);
//q[ldq+0] += (x1 + (y1*h2));
//q[ldq+1] += (x2 + (y2*h2));
//q[ldq+2] += (x3 + (y3*h2));
//q[ldq+3] += (x4 + (y4*h2));
q1
=
_mm256_add_pd
(
q1
,
x1
);
q2
=
_mm256_add_pd
(
q2
,
x2
);
q3
=
_mm256_add_pd
(
q3
,
x3
);
...
...
@@ -405,16 +358,11 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
for
(
i
=
2
;
i
<
nb
;
i
++
)
{
//q[(i*ldq)+0] += ((x1*h1) + (y1*h2));
//q[(i*ldq)+1] += ((x2*h1) + (y2*h2));
//q[(i*ldq)+2] += ((x3*h1) + (y3*h2));
//q[(i*ldq)+3] += ((x4*h1) + (y4*h2));
q1
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
0
]);
q2
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
4
]);
q3
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
8
]);
q4
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
12
]);
//h1 = hh[i-1];
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
i
-
1
)
*
2
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
i
-
1
)
*
2
)
+
1
]);
...
...
@@ -427,7 +375,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
tmp4
=
_mm256_mul_pd
(
h1_imag
,
x4
);
q4
=
_mm256_add_pd
(
q4
,
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
x4
),
_mm256_shuffle_pd
(
tmp4
,
tmp4
,
0x5
)));
//h2 = hh[ldh+i];
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
+
i
)
*
2
]);
h2_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
ldh
+
i
)
*
2
)
+
1
]);
...
...
@@ -445,11 +392,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
_mm256_store_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
8
],
q3
);
_mm256_store_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
12
],
q4
);
}
//h1 = hh[nb-1];
//q[(nb*ldq)+0] += (x1*h1);
//q[(nb*ldq)+1] += (x2*h1);
//q[(nb*ldq)+2] += (x3*h1);
//q[(nb*ldq)+3] += (x4*h1);
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
nb
-
1
)
*
2
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
nb
-
1
)
*
2
)
+
1
]);
...
...
@@ -472,7 +414,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
_mm256_store_pd
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
8
],
q3
);
_mm256_store_pd
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
12
],
q4
);
}
#endif
extern
"C"
__forceinline
void
hh_trafo_complex_kernel_6_AVX_2hv
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
std
::
complex
<
double
>
s
)
{
...
...
@@ -489,24 +430,15 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
__m256d
sign
=
(
__m256d
)
_mm256_set_epi64x
(
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
);
//x1 = q[ldq+0];
//x2 = q[ldq+1];
//x3 = q[ldq+2];
//x4 = q[ldq+3];
x1
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
ldq
)
+
0
]);
x2
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
ldq
)
+
4
]);
x3
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
ldq
)
+
8
]);
//h2 = conj(hh[ldh+1]);
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
+
1
)
*
2
]);
h2_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
// conjugate
h2_imag
=
_mm256_xor_pd
(
h2_imag
,
sign
);
//y1 = q[0] + (x1*h2);
//y2 = q[1] + (x2*h2);
//y3 = q[2] + (x3*h2);
//y4 = q[3] + (x4*h2);
y1
=
_mm256_load_pd
(
&
q_dbl
[
0
]);
y2
=
_mm256_load_pd
(
&
q_dbl
[
4
]);
y3
=
_mm256_load_pd
(
&
q_dbl
[
8
]);
...
...
@@ -520,17 +452,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
for
(
i
=
2
;
i
<
nb
;
i
++
)
{
//h1 = conj(hh[i-1]);
//h2 = conj(hh[ldh+i]);
//x1 += (q[(i*ldq)+0] * h1);
//y1 += (q[(i*ldq)+0] * h2);
//x2 += (q[(i*ldq)+1] * h1);
//y2 += (q[(i*ldq)+1] * h2);
//x3 += (q[(i*ldq)+2] * h1);
//y3 += (q[(i*ldq)+2] * h2);
//x4 += (q[(i*ldq)+3] * h1);
//y4 += (q[(i*ldq)+3] * h2);
q1
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
0
]);
q2
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
4
]);
q3
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
8
]);
...
...
@@ -560,11 +481,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
y3
=
_mm256_add_pd
(
y3
,
_mm256_addsub_pd
(
_mm256_mul_pd
(
h2_real
,
q3
),
_mm256_shuffle_pd
(
tmp3
,
tmp3
,
0x5
)));
}
//h1 = conj(hh[nb-1]);
//x1 += (q[(nb*ldq)+0] * h1);
//x2 += (q[(nb*ldq)+1] * h1);
//x3 += (q[(nb*ldq)+2] * h1);
//x4 += (q[(nb*ldq)+3] * h1);
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
nb
-
1
)
*
2
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
nb
-
1
)
*
2
)
+
1
]);
// conjugate
...
...
@@ -581,17 +497,11 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
tmp3
=
_mm256_mul_pd
(
h1_imag
,
q3
);
x3
=
_mm256_add_pd
(
x3
,
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
q3
),
_mm256_shuffle_pd
(
tmp3
,
tmp3
,
0x5
)));
//tau1 = hh[0];
//h1 = (-1.0)*tau1;
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[
0
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[
1
]);
h1_real
=
_mm256_xor_pd
(
h1_real
,
sign
);
h1_imag
=
_mm256_xor_pd
(
h1_imag
,
sign
);
//x1 *= h1;
//x2 *= h1;
//x3 *= h1;
//x4 *= h1;
tmp1
=
_mm256_mul_pd
(
h1_imag
,
x1
);
x1
=
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
x1
),
_mm256_shuffle_pd
(
tmp1
,
tmp1
,
0x5
));
tmp2
=
_mm256_mul_pd
(
h1_imag
,
x2
);
...
...
@@ -599,9 +509,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
tmp3
=
_mm256_mul_pd
(
h1_imag
,
x3
);
x3
=
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
x3
),
_mm256_shuffle_pd
(
tmp3
,
tmp3
,
0x5
));
//tau2 = hh[ldh];
//h1 = (-1.0)*tau2;
//h2 = (-1.0)*tau2;
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[
ldh
*
2
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
*
2
)
+
1
]);
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[
ldh
*
2
]);
...
...
@@ -612,7 +519,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
h2_real
=
_mm256_xor_pd
(
h2_real
,
sign
);
h2_imag
=
_mm256_xor_pd
(
h2_imag
,
sign
);
//h2 *= s;
__m128d
tmp_s_128
=
_mm_loadu_pd
(
s_dbl
);
tmp2
=
_mm256_broadcast_pd
(
&
tmp_s_128
);
tmp1
=
_mm256_mul_pd
(
h2_imag
,
tmp2
);
...
...
@@ -621,10 +527,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
h2_real
=
_mm256_broadcast_sd
(
&
s_dbl
[
0
]);
h2_imag
=
_mm256_broadcast_sd
(
&
s_dbl
[
1
]);
//y1 = y1*h1 +x1*h2;
//y2 = y2*h1 +x2*h2;
//y3 = y3*h1 +x3*h2;
//y4 = y4*h1 +x4*h2;
tmp1
=
_mm256_mul_pd
(
h1_imag
,
y1
);
y1
=
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
y1
),
_mm256_shuffle_pd
(
tmp1
,
tmp1
,
0x5
));
tmp2
=
_mm256_mul_pd
(
h1_imag
,
y2
);
...
...
@@ -632,7 +534,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
tmp3
=
_mm256_mul_pd
(
h1_imag
,
y3
);
y3
=
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
y3
),
_mm256_shuffle_pd
(
tmp3
,
tmp3
,
0x5
));
// y1+=x1*h2
tmp1
=
_mm256_mul_pd
(
h2_imag
,
x1
);
y1
=
_mm256_add_pd
(
y1
,
_mm256_addsub_pd
(
_mm256_mul_pd
(
h2_real
,
x1
),
_mm256_shuffle_pd
(
tmp1
,
tmp1
,
0x5
)));
tmp2
=
_mm256_mul_pd
(
h2_imag
,
x2
);
...
...
@@ -644,10 +545,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
q2
=
_mm256_load_pd
(
&
q_dbl
[
4
]);
q3
=
_mm256_load_pd
(
&
q_dbl
[
8
]);
//q[0] += y1;
//q[1] += y2;
//q[2] += y3;
//q[3] += y4;
q1
=
_mm256_add_pd
(
q1
,
y1
);
q2
=
_mm256_add_pd
(
q2
,
y2
);
q3
=
_mm256_add_pd
(
q3
,
y3
);
...
...
@@ -656,7 +553,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
_mm256_store_pd
(
&
q_dbl
[
4
],
q2
);
_mm256_store_pd
(
&
q_dbl
[
8
],
q3
);
//h2 = hh[ldh+1];
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
+
1
)
*
2
]);
h2_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
...
...
@@ -664,10 +560,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
q2
=
_mm256_load_pd
(
&
q_dbl
[(
ldq
*
2
)
+
4
]);
q3
=
_mm256_load_pd
(
&
q_dbl
[(
ldq
*
2
)
+
8
]);
//q[ldq+0] += (x1 + (y1*h2));
//q[ldq+1] += (x2 + (y2*h2));
//q[ldq+2] += (x3 + (y3*h2));
//q[ldq+3] += (x4 + (y4*h2));
q1
=
_mm256_add_pd
(
q1
,
x1
);
q2
=
_mm256_add_pd
(
q2
,
x2
);
q3
=
_mm256_add_pd
(
q3
,
x3
);
...
...
@@ -685,15 +577,10 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
for
(
i
=
2
;
i
<
nb
;
i
++
)
{
//q[(i*ldq)+0] += ((x1*h1) + (y1*h2));
//q[(i*ldq)+1] += ((x2*h1) + (y2*h2));
//q[(i*ldq)+2] += ((x3*h1) + (y3*h2));
//q[(i*ldq)+3] += ((x4*h1) + (y4*h2));
q1
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
0
]);
q2
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
4
]);
q3
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
8
]);
//h1 = hh[i-1];
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
i
-
1
)
*
2
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
i
-
1
)
*
2
)
+
1
]);
...
...
@@ -704,7 +591,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
tmp3
=
_mm256_mul_pd
(
h1_imag
,
x3
);
q3
=
_mm256_add_pd
(
q3
,
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
x3
),
_mm256_shuffle_pd
(
tmp3
,
tmp3
,
0x5
)));
//h2 = hh[ldh+i];
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
+
i
)
*
2
]);
h2_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
ldh
+
i
)
*
2
)
+
1
]);
...
...
@@ -719,11 +605,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<dou
_mm256_store_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
4
],
q2
);
_mm256_store_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
8
],
q3
);
}
//h1 = hh[nb-1];
//q[(nb*ldq)+0] += (x1*h1);
//q[(nb*ldq)+1] += (x2*h1);
//q[(nb*ldq)+2] += (x3*h1);
//q[(nb*ldq)+3] += (x4*h1);
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
nb
-
1
)
*
2
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
nb
-
1
)
*
2
)
+
1
]);
...
...
@@ -758,23 +639,14 @@ extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<dou
__m256d
sign
=
(
__m256d
)
_mm256_set_epi64x
(
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
);
//x1 = q[ldq+0];
//x2 = q[ldq+1];
//x3 = q[ldq+2];
//x4 = q[ldq+3];
x1
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
ldq
)
+
0
]);
x2
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
ldq
)
+
4
]);
//h2 = conj(hh[ldh+1]);
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
+
1
)
*
2
]);
h2_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
// conjugate
h2_imag
=
_mm256_xor_pd
(
h2_imag
,
sign
);
//y1 = q[0] + (x1*h2);
//y2 = q[1] + (x2*h2);
//y3 = q[2] + (x3*h2);
//y4 = q[3] + (x4*h2);
y1
=
_mm256_load_pd
(
&
q_dbl
[
0
]);
y2
=
_mm256_load_pd
(
&
q_dbl
[
4
]);
...
...
@@ -785,17 +657,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<dou
for
(
i
=
2
;
i
<
nb
;
i
++
)
{
//h1 = conj(hh[i-1]);
//h2 = conj(hh[ldh+i]);
//x1 += (q[(i*ldq)+0] * h1);
//y1 += (q[(i*ldq)+0] * h2);
//x2 += (q[(i*ldq)+1] * h1);
//y2 += (q[(i*ldq)+1] * h2);
//x3 += (q[(i*ldq)+2] * h1);
//y3 += (q[(i*ldq)+2] * h2);
//x4 += (q[(i*ldq)+3] * h1);
//y4 += (q[(i*ldq)+3] * h2);
q1
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
0
]);
q2
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
4
]);
...
...
@@ -820,11 +681,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<dou
y2
=
_mm256_add_pd
(
y2
,
_mm256_addsub_pd
(
_mm256_mul_pd
(
h2_real
,
q2
),
_mm256_shuffle_pd
(
tmp2
,
tmp2
,
0x5
)));
}
//h1 = conj(hh[nb-1]);
//x1 += (q[(nb*ldq)+0] * h1);
//x2 += (q[(nb*ldq)+1] * h1);
//x3 += (q[(nb*ldq)+2] * h1);
//x4 += (q[(nb*ldq)+3] * h1);
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
nb
-
1
)
*
2
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
nb
-
1
)
*
2
)
+
1
]);
// conjugate
...
...
@@ -838,25 +694,16 @@ extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<dou
tmp2
=
_mm256_mul_pd
(
h1_imag
,
q2
);
x2
=
_mm256_add_pd
(
x2
,
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
q2
),
_mm256_shuffle_pd
(
tmp2
,
tmp2
,
0x5
)));
//tau1 = hh[0];
//h1 = (-1.0)*tau1;
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[
0
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[
1
]);
h1_real
=
_mm256_xor_pd
(
h1_real
,
sign
);
h1_imag
=
_mm256_xor_pd
(
h1_imag
,
sign
);
//x1 *= h1;
//x2 *= h1;
//x3 *= h1;
//x4 *= h1;
tmp1
=
_mm256_mul_pd
(
h1_imag
,
x1
);
x1
=
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
x1
),
_mm256_shuffle_pd
(
tmp1
,
tmp1
,
0x5
));
tmp2
=
_mm256_mul_pd
(
h1_imag
,
x2
);
x2
=
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
x2
),
_mm256_shuffle_pd
(
tmp2
,
tmp2
,
0x5
));
//tau2 = hh[ldh];
//h1 = (-1.0)*tau2;
//h2 = (-1.0)*tau2;
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[
ldh
*
2
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
*
2
)
+
1
]);
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[
ldh
*
2
]);
...
...
@@ -867,7 +714,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<dou
h2_real
=
_mm256_xor_pd
(
h2_real
,
sign
);
h2_imag
=
_mm256_xor_pd
(
h2_imag
,
sign
);
//h2 *= s;
__m128d
tmp_s_128
=
_mm_loadu_pd
(
s_dbl
);
tmp2
=
_mm256_broadcast_pd
(
&
tmp_s_128
);
tmp1
=
_mm256_mul_pd
(
h2_imag
,
tmp2
);
...
...
@@ -876,16 +722,11 @@ extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<dou
h2_real
=
_mm256_broadcast_sd
(
&
s_dbl
[
0
]);
h2_imag
=
_mm256_broadcast_sd
(
&
s_dbl
[
1
]);
//y1 = y1*h1 +x1*h2;
//y2 = y2*h1 +x2*h2;
//y3 = y3*h1 +x3*h2;
//y4 = y4*h1 +x4*h2;
tmp1
=
_mm256_mul_pd
(
h1_imag
,
y1
);
y1
=
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
y1
),
_mm256_shuffle_pd
(
tmp1
,
tmp1
,
0x5
));
tmp2
=
_mm256_mul_pd
(
h1_imag
,
y2
);
y2
=
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
y2
),
_mm256_shuffle_pd
(
tmp2
,
tmp2
,
0x5
));
// y1+=x1*h2
tmp1
=
_mm256_mul_pd
(
h2_imag
,
x1
);
y1
=
_mm256_add_pd
(
y1
,
_mm256_addsub_pd
(
_mm256_mul_pd
(
h2_real
,
x1
),
_mm256_shuffle_pd
(
tmp1
,
tmp1
,
0x5
)));
tmp2
=
_mm256_mul_pd
(
h2_imag
,
x2
);
...
...
@@ -894,27 +735,18 @@ extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<dou
q1
=
_mm256_load_pd
(
&
q_dbl
[
0
]);
q2
=
_mm256_load_pd
(
&
q_dbl
[
4
]);
//q[0] += y1;
//q[1] += y2;
//q[2] += y3;
//q[3] += y4;
q1
=
_mm256_add_pd
(
q1
,
y1
);
q2
=
_mm256_add_pd
(
q2
,
y2
);
_mm256_store_pd
(
&
q_dbl
[
0
],
q1
);
_mm256_store_pd
(
&
q_dbl
[
4
],
q2
);
//h2 = hh[ldh+1];
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
+
1
)
*
2
]);
h2_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
q1
=
_mm256_load_pd
(
&
q_dbl
[(
ldq
*
2
)
+
0
]);
q2
=
_mm256_load_pd
(
&
q_dbl
[(
ldq
*
2
)
+
4
]);
//q[ldq+0] += (x1 + (y1*h2));
//q[ldq+1] += (x2 + (y2*h2));
//q[ldq+2] += (x3 + (y3*h2));
//q[ldq+3] += (x4 + (y4*h2));
q1
=
_mm256_add_pd
(
q1
,
x1
);
q2
=
_mm256_add_pd
(
q2
,
x2
);
...
...
@@ -928,14 +760,9 @@ extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<dou
for
(
i
=
2
;
i
<
nb
;
i
++
)
{
//q[(i*ldq)+0] += ((x1*h1) + (y1*h2));
//q[(i*ldq)+1] += ((x2*h1) + (y2*h2));
//q[(i*ldq)+2] += ((x3*h1) + (y3*h2));
//q[(i*ldq)+3] += ((x4*h1) + (y4*h2));
q1
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
0
]);
q2
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
4
]);
//h1 = hh[i-1];
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
i
-
1
)
*
2
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
i
-
1
)
*
2
)
+
1
]);
...
...
@@ -944,7 +771,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<dou
tmp2
=
_mm256_mul_pd
(
h1_imag
,
x2
);
q2
=
_mm256_add_pd
(
q2
,
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
x2
),
_mm256_shuffle_pd
(
tmp2
,
tmp2
,
0x5
)));
//h2 = hh[ldh+i];
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
+
i
)
*
2
]);
h2_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
ldh
+
i
)
*
2
)
+
1
]);
...
...
@@ -956,11 +782,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<dou
_mm256_store_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
0
],
q1
);
_mm256_store_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
4
],
q2
);
}
//h1 = hh[nb-1];
//q[(nb*ldq)+0] += (x1*h1);
//q[(nb*ldq)+1] += (x2*h1);
//q[(nb*ldq)+2] += (x3*h1);
//q[(nb*ldq)+3] += (x4*h1);
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
nb
-
1
)
*
2
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
nb
-
1
)
*
2
)
+
1
]);
...
...
@@ -1052,7 +873,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<dou
h2_real
=
_mm256_xor_pd
(
h2_real
,
sign
);
h2_imag
=
_mm256_xor_pd
(
h2_imag
,
sign
);
//h2 *= s;
__m128d
tmp_s_128
=
_mm_loadu_pd
(
s_dbl
);
__m256d
tmp2
=
_mm256_broadcast_pd
(
&
tmp_s_128
);
tmp1
=
_mm256_mul_pd
(
h2_imag
,
tmp2
);
...
...
@@ -1073,7 +893,6 @@ extern "C" __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<dou
_mm256_store_pd
(
&
q_dbl
[
0
],
q1
);
//h2 = hh[ldh+1];
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
+
1
)
*
2
]);
h2_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
ldh
+
1
)
*
2
)
+
1
]);
...
...
@@ -1090,14 +909,12 @@ extern "C" __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<dou
{
q1
=
_mm256_load_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
0
]);
//h1 = hh[i-1];
h1_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
i
-
1
)
*
2
]);
h1_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
i
-
1
)
*
2
)
+
1
]);
tmp1
=
_mm256_mul_pd
(
h1_imag
,
x1
);
q1
=
_mm256_add_pd
(
q1
,
_mm256_addsub_pd
(
_mm256_mul_pd
(
h1_real
,
x1
),
_mm256_shuffle_pd
(
tmp1
,
tmp1
,
0x5
)));
//h2 = hh[ldh+i];
h2_real
=
_mm256_broadcast_sd
(
&
hh_dbl
[(
ldh
+
i
)
*
2
]);
h2_imag
=
_mm256_broadcast_sd
(
&
hh_dbl
[((
ldh
+
i
)
*
2
)
+
1
]);
...
...
@@ -1132,25 +949,16 @@ extern "C" __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<dou
__m128d
sign
=
(
__m128d
)
_mm_set_epi64x
(
0x8000000000000000
,
0x8000000000000000
);
//x1 = q[ldq+0];
//x2 = q[ldq+1];
//x3 = q[ldq+2];
//x4 = q[ldq+3];
x1
=
_mm_load_pd
(
&
q_dbl
[(
2
*
ldq
)
+
0
]);
x2
=
_mm_load_pd
(
&
q_dbl
[(
2
*
ldq
)
+
2
]);
x3
=
_mm_load_pd
(
&
q_dbl
[(
2
*
ldq
)
+
4
]);
x4
=
_mm_load_pd
(
&
q_dbl
[(
2
*
ldq
)
+
6
]);