953a4ed6
Commit
953a4ed6
authored
Jan 03, 2013
by
Alexander Heinecke
working SSE and AVX intrinsic version of complex 2hv
parent da10d57e
Changes: 6
ELPA_2011.12.Intrinsics/src/elpa2_kernels/elpa2_tum_kernels_complex_sse-avx_1hv.cpp
...
...
@@ -11,6 +11,7 @@
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// --------------------------------------------------------------------------------------------------
#include <complex>
...
...
ELPA_2011.12.Intrinsics/src/elpa2_kernels/elpa2_tum_kernels_complex_sse-avx_2hv.cpp
...
...
@@ -11,6 +11,7 @@
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// --------------------------------------------------------------------------------------------------
#include <complex>
...
...
@@ -24,38 +25,13 @@
//Forward declaration
#ifdef __AVX__
extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#else
extern "C" __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#endif
extern "C" void double_hh_trafo_complex_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
    int i;
    int nb = *pnb;
    int nq = *pldq;
    int ldq = *pldq;
    int ldh = *pldh;

    std::complex<double> s = conj(hh[(ldh)+1])*1.0;
    for (i = 2; i < nb; i++)
    {
        s += hh[i-1]*conj(hh[(i+ldh)]);
    }

#ifdef __AVX__
    for (i = 0; i < nq; i+=4)
    {
        hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
    }
#else
    for (i = 0; i < nq; i+=4)
    {
        hh_trafo_complex_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
    }
#endif
}
#if 0
extern "C" __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
std::complex<double> x1;
...
...
@@ -151,15 +127,879 @@ extern "C" __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<doubl
q[(nb*ldq)+2] += (x3*h1);
q[(nb*ldq)+3] += (x4*h1);
}
#endif
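#if 0
// Reference only: a scalar sketch of the two-Householder-vector ("2hv") update
// applied to a single column of q, following the pseudocode comments in the
// intrinsic kernels below. The function name is hypothetical and this routine
// is not part of the optimized code path.
static void hh_trafo_complex_column_2hv_ref(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
    // accumulate x = <q, conj(hh1)> and y = <q, conj(hh2)>
    std::complex<double> x = q[ldq];
    std::complex<double> y = q[0] + q[ldq]*conj(hh[ldh+1]);
    for (int i = 2; i < nb; i++)
    {
        x += q[i*ldq] * conj(hh[i-1]);
        y += q[i*ldq] * conj(hh[ldh+i]);
    }
    x += q[nb*ldq] * conj(hh[nb-1]);

    // rank-2 update with tau1 = hh[0], tau2 = hh[ldh] and the precomputed s
    std::complex<double> tau1 = hh[0];
    std::complex<double> tau2 = hh[ldh];
    x *= -tau1;
    y = y*(-tau2) + x*((-tau2)*s);

    q[0]   += y;
    q[ldq] += x + y*hh[ldh+1];
    for (int i = 2; i < nb; i++)
    {
        q[i*ldq] += x*hh[i-1] + y*hh[ldh+i];
    }
    q[nb*ldq] += x*hh[nb-1];
}
#endif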
extern "C" void double_hh_trafo_complex_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
    int i;
    int nb = *pnb;
    int nq = *pldq;
    int ldq = *pldq;
    int ldh = *pldh;

    std::complex<double> s = conj(hh[(ldh)+1])*1.0;
    for (i = 2; i < nb; i++)
    {
        s += hh[i-1]*conj(hh[(i+ldh)]);
    }

#ifdef __AVX__
    for (i = 0; i < nq-4; i+=8)
    {
        hh_trafo_complex_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
    }
    if (i < nq)
    {
        hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
    }
#else
    for (i = 0; i < nq; i+=4)
    {
        hh_trafo_complex_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
    }
#endif
}
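#if 0
// Reference only: a minimal calling sketch for the Fortran-style driver above
// (trailing underscore, every argument passed by pointer). Assumptions, not
// taken from this commit: q is column-major with leading dimension ldq, the
// data is 32-byte aligned as required by _mm256_load_pd in the AVX kernels,
// nq is a multiple of 4, hh holds the two Householder vectors with leading
// dimension ldh >= nb (hh[0] and hh[ldh] carry tau1 and tau2 as the kernels
// expect), and _mm_malloc/_mm_free are available from the intrinsics headers.
static void example_call_double_hh_trafo_complex()
{
    int nb = 8, nq = 8, ldq = 8, ldh = 8;
    std::complex<double>* q  = (std::complex<double>*)_mm_malloc((size_t)(nb+1)*ldq*sizeof(std::complex<double>), 32);
    std::complex<double>* hh = (std::complex<double>*)_mm_malloc((size_t)2*ldh*sizeof(std::complex<double>), 32);
    // ... fill q (nb+1 columns of length ldq) and hh (two Householder vectors) ...
    double_hh_trafo_complex_(q, hh, &nb, &nq, &ldq, &ldh);
    _mm_free(hh);
    _mm_free(q);
}
#endif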
#ifdef __AVX__
extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
    double* q_dbl = (double*)q;
    double* hh_dbl = (double*)hh;
    double* s_dbl = (double*)(&s);

    __m256d x1, x2, x3, x4;
    __m256d y1, y2, y3, y4;
    __m256d q1, q2, q3, q4;
    __m256d h1_real, h1_imag, h2_real, h2_imag;
    __m256d tmp1, tmp2, tmp3, tmp4;
    int i = 0;

    __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
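    // Note on the two idioms used throughout this kernel:
    // - xor-ing a register with 'sign' flips the sign of every lane; it is used
    //   below to conjugate a Householder coefficient (imaginary part only) and
    //   to negate tau (real and imaginary parts).
    // - a packed complex product h*x over interleaved (re,im) pairs is built as
    //       tmp = _mm256_mul_pd(h_imag, x);
    //       _mm256_addsub_pd(_mm256_mul_pd(h_real, x), _mm256_shuffle_pd(tmp, tmp, 0x5));
    //   which evaluates per pair to (h_re*x_re - h_im*x_im, h_re*x_im + h_im*x_re).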
    //x1 = q[ldq+0];
    //x2 = q[ldq+1];
    //x3 = q[ldq+2];
    //x4 = q[ldq+3];
    x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]);
    x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]);
    x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]);
    x4 = _mm256_load_pd(&q_dbl[(2*ldq)+12]);

    //h2 = conj(hh[ldh+1]);
    h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
    h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
    // conjugate
    h2_imag = _mm256_xor_pd(h2_imag, sign);

    //y1 = q[0] + (x1*h2);
    //y2 = q[1] + (x2*h2);
    //y3 = q[2] + (x3*h2);
    //y4 = q[3] + (x4*h2);
    y1 = _mm256_load_pd(&q_dbl[0]);
    y2 = _mm256_load_pd(&q_dbl[4]);
    y3 = _mm256_load_pd(&q_dbl[8]);
    y4 = _mm256_load_pd(&q_dbl[12]);

    tmp1 = _mm256_mul_pd(h2_imag, x1);
    y1 = _mm256_add_pd(y1, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
    tmp2 = _mm256_mul_pd(h2_imag, x2);
    y2 = _mm256_add_pd(y2, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
    tmp3 = _mm256_mul_pd(h2_imag, x3);
    y3 = _mm256_add_pd(y3, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
    tmp4 = _mm256_mul_pd(h2_imag, x4);
    y4 = _mm256_add_pd(y4, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
    for (i = 2; i < nb; i++)
    {
        //h1 = conj(hh[i-1]);
        //h2 = conj(hh[ldh+i]);
        //x1 += (q[(i*ldq)+0] * h1);
        //y1 += (q[(i*ldq)+0] * h2);
        //x2 += (q[(i*ldq)+1] * h1);
        //y2 += (q[(i*ldq)+1] * h2);
        //x3 += (q[(i*ldq)+2] * h1);
        //y3 += (q[(i*ldq)+2] * h2);
        //x4 += (q[(i*ldq)+3] * h1);
        //y4 += (q[(i*ldq)+3] * h2);
        q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
        q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
        q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
        q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

        h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
        h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);
        // conjugate
        h1_imag = _mm256_xor_pd(h1_imag, sign);

        tmp1 = _mm256_mul_pd(h1_imag, q1);
        x1 = _mm256_add_pd(x1, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
        tmp2 = _mm256_mul_pd(h1_imag, q2);
        x2 = _mm256_add_pd(x2, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
        tmp3 = _mm256_mul_pd(h1_imag, q3);
        x3 = _mm256_add_pd(x3, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
        tmp4 = _mm256_mul_pd(h1_imag, q4);
        x4 = _mm256_add_pd(x4, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));

        h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
        h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);
        // conjugate
        h2_imag = _mm256_xor_pd(h2_imag, sign);

        tmp1 = _mm256_mul_pd(h2_imag, q1);
        y1 = _mm256_add_pd(y1, _mm256_addsub_pd(_mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
        tmp2 = _mm256_mul_pd(h2_imag, q2);
        y2 = _mm256_add_pd(y2, _mm256_addsub_pd(_mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
        tmp3 = _mm256_mul_pd(h2_imag, q3);
        y3 = _mm256_add_pd(y3, _mm256_addsub_pd(_mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
        tmp4 = _mm256_mul_pd(h2_imag, q4);
        y4 = _mm256_add_pd(y4, _mm256_addsub_pd(_mm256_mul_pd(h2_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
    }
    //h1 = conj(hh[nb-1]);
    //x1 += (q[(nb*ldq)+0] * h1);
    //x2 += (q[(nb*ldq)+1] * h1);
    //x3 += (q[(nb*ldq)+2] * h1);
    //x4 += (q[(nb*ldq)+3] * h1);
    h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
    h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);
    // conjugate
    h1_imag = _mm256_xor_pd(h1_imag, sign);

    q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
    q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]);
    q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]);
    q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]);

    tmp1 = _mm256_mul_pd(h1_imag, q1);
    x1 = _mm256_add_pd(x1, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
    tmp2 = _mm256_mul_pd(h1_imag, q2);
    x2 = _mm256_add_pd(x2, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
    tmp3 = _mm256_mul_pd(h1_imag, q3);
    x3 = _mm256_add_pd(x3, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
    tmp4 = _mm256_mul_pd(h1_imag, q4);
    x4 = _mm256_add_pd(x4, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));

    //tau1 = hh[0];
    //h1 = (-1.0)*tau1;
    h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
    h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
    h1_real = _mm256_xor_pd(h1_real, sign);
    h1_imag = _mm256_xor_pd(h1_imag, sign);

    //x1 *= h1;
    //x2 *= h1;
    //x3 *= h1;
    //x4 *= h1;
    tmp1 = _mm256_mul_pd(h1_imag, x1);
    x1 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
    tmp2 = _mm256_mul_pd(h1_imag, x2);
    x2 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
    tmp3 = _mm256_mul_pd(h1_imag, x3);
    x3 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
    tmp4 = _mm256_mul_pd(h1_imag, x4);
    x4 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));

    //tau2 = hh[ldh];
    //h1 = (-1.0)*tau2;
    //h2 = (-1.0)*tau2;
    h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
    h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
    h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
    h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
    h1_real = _mm256_xor_pd(h1_real, sign);
    h1_imag = _mm256_xor_pd(h1_imag, sign);
    h2_real = _mm256_xor_pd(h2_real, sign);
    h2_imag = _mm256_xor_pd(h2_imag, sign);

    //h2 *= s;
    __m128d tmp_s_128 = _mm_loadu_pd(s_dbl);
    tmp2 = _mm256_broadcast_pd(&tmp_s_128);
    tmp1 = _mm256_mul_pd(h2_imag, tmp2);
    tmp2 = _mm256_addsub_pd(_mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
    _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2));
    h2_real = _mm256_broadcast_sd(&s_dbl[0]);
    h2_imag = _mm256_broadcast_sd(&s_dbl[1]);
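    // At this point h1 = -tau2 and h2 = (-tau2)*s (s was overwritten in place via
    // s_dbl), so the block below realizes y = y*(-tau2) + x*((-tau2)*s) from the
    // scalar formulation.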
    //y1 = y1*h1 +x1*h2;
    //y2 = y2*h1 +x2*h2;
    //y3 = y3*h1 +x3*h2;
    //y4 = y4*h1 +x4*h2;
    tmp1 = _mm256_mul_pd(h1_imag, y1);
    y1 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
    tmp2 = _mm256_mul_pd(h1_imag, y2);
    y2 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
    tmp3 = _mm256_mul_pd(h1_imag, y3);
    y3 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
    tmp4 = _mm256_mul_pd(h1_imag, y4);
    y4 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));

    // y1+=x1*h2
    tmp1 = _mm256_mul_pd(h2_imag, x1);
    y1 = _mm256_add_pd(y1, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
    tmp2 = _mm256_mul_pd(h2_imag, x2);
    y2 = _mm256_add_pd(y2, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
    tmp3 = _mm256_mul_pd(h2_imag, x3);
    y3 = _mm256_add_pd(y3, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
    tmp4 = _mm256_mul_pd(h2_imag, x4);
    y4 = _mm256_add_pd(y4, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));

    q1 = _mm256_load_pd(&q_dbl[0]);
    q2 = _mm256_load_pd(&q_dbl[4]);
    q3 = _mm256_load_pd(&q_dbl[8]);
    q4 = _mm256_load_pd(&q_dbl[12]);

    //q[0] += y1;
    //q[1] += y2;
    //q[2] += y3;
    //q[3] += y4;
    q1 = _mm256_add_pd(q1, y1);
    q2 = _mm256_add_pd(q2, y2);
    q3 = _mm256_add_pd(q3, y3);
    q4 = _mm256_add_pd(q4, y4);
    _mm256_store_pd(&q_dbl[0], q1);
    _mm256_store_pd(&q_dbl[4], q2);
    _mm256_store_pd(&q_dbl[8], q3);
    _mm256_store_pd(&q_dbl[12], q4);

    //h2 = hh[ldh+1];
    h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
    h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);

    q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]);
    q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]);
    q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]);
    q4 = _mm256_load_pd(&q_dbl[(ldq*2)+12]);

    //q[ldq+0] += (x1 + (y1*h2));
    //q[ldq+1] += (x2 + (y2*h2));
    //q[ldq+2] += (x3 + (y3*h2));
    //q[ldq+3] += (x4 + (y4*h2));
    q1 = _mm256_add_pd(q1, x1);
    q2 = _mm256_add_pd(q2, x2);
    q3 = _mm256_add_pd(q3, x3);
    q4 = _mm256_add_pd(q4, x4);

    tmp1 = _mm256_mul_pd(h2_imag, y1);
    q1 = _mm256_add_pd(q1, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
    tmp2 = _mm256_mul_pd(h2_imag, y2);
    q2 = _mm256_add_pd(q2, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
    tmp3 = _mm256_mul_pd(h2_imag, y3);
    q3 = _mm256_add_pd(q3, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
    tmp4 = _mm256_mul_pd(h2_imag, y4);
    q4 = _mm256_add_pd(q4, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));

    _mm256_store_pd(&q_dbl[(ldq*2)+0], q1);
    _mm256_store_pd(&q_dbl[(ldq*2)+4], q2);
    _mm256_store_pd(&q_dbl[(ldq*2)+8], q3);
    _mm256_store_pd(&q_dbl[(ldq*2)+12], q4);

    for (i = 2; i < nb; i++)
    {
        //q[(i*ldq)+0] += ((x1*h1) + (y1*h2));
        //q[(i*ldq)+1] += ((x2*h1) + (y2*h2));
        //q[(i*ldq)+2] += ((x3*h1) + (y3*h2));
        //q[(i*ldq)+3] += ((x4*h1) + (y4*h2));
        q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
        q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
        q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
        q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

        //h1 = hh[i-1];
        h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
        h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);

        tmp1 = _mm256_mul_pd(h1_imag, x1);
        q1 = _mm256_add_pd(q1, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
        tmp2 = _mm256_mul_pd(h1_imag, x2);
        q2 = _mm256_add_pd(q2, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
        tmp3 = _mm256_mul_pd(h1_imag, x3);
        q3 = _mm256_add_pd(q3, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
        tmp4 = _mm256_mul_pd(h1_imag, x4);
        q4 = _mm256_add_pd(q4, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));

        //h2 = hh[ldh+i];
        h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
        h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);

        tmp1 = _mm256_mul_pd(h2_imag, y1);
        q1 = _mm256_add_pd(q1, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
        tmp2 = _mm256_mul_pd(h2_imag, y2);
        q2 = _mm256_add_pd(q2, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
        tmp3 = _mm256_mul_pd(h2_imag, y3);
        q3 = _mm256_add_pd(q3, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
        tmp4 = _mm256_mul_pd(h2_imag, y4);
        q4 = _mm256_add_pd(q4, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));

        _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
        _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
        _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3);
        _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4);
    }

    //h1 = hh[nb-1];
    //q[(nb*ldq)+0] += (x1*h1);
    //q[(nb*ldq)+1] += (x2*h1);
    //q[(nb*ldq)+2] += (x3*h1);
    //q[(nb*ldq)+3] += (x4*h1);
    h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
    h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);

    q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
    q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]);
    q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]);
    q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]);

    tmp1 = _mm256_mul_pd(h1_imag, x1);
    q1 = _mm256_add_pd(q1, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
    tmp2 = _mm256_mul_pd(h1_imag, x2);
    q2 = _mm256_add_pd(q2, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
    tmp3 = _mm256_mul_pd(h1_imag, x3);
    q3 = _mm256_add_pd(q3, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
    tmp4 = _mm256_mul_pd(h1_imag, x4);
    q4 = _mm256_add_pd(q4, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));

    _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
    _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2);
    _mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3);
    _mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4);
}
extern "C" __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
    hh_trafo_complex_kernel_4_C_2hv(q, hh, nb, ldq, ldh, s);

    double* q_dbl = (double*)q;
    double* hh_dbl = (double*)hh;
    double* s_dbl = (double*)(&s);

    __m256d x1, x2;
    __m256d y1, y2;
    __m256d q1, q2;
    __m256d h1_real, h1_imag, h2_real, h2_imag;
    __m256d tmp1, tmp2;
    int i = 0;

    __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);

    //x1 = q[ldq+0];
    //x2 = q[ldq+1];
    //x3 = q[ldq+2];
    //x4 = q[ldq+3];
    x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]);
    x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]);

    //h2 = conj(hh[ldh+1]);
    h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
    h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
    // conjugate
    h2_imag = _mm256_xor_pd(h2_imag, sign);

    //y1 = q[0] + (x1*h2);
    //y2 = q[1] + (x2*h2);
    //y3 = q[2] + (x3*h2);
    //y4 = q[3] + (x4*h2);
    y1 = _mm256_load_pd(&q_dbl[0]);
    y2 = _mm256_load_pd(&q_dbl[4]);

    tmp1 = _mm256_mul_pd(h2_imag, x1);
    y1 = _mm256_add_pd(y1, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
    tmp2 = _mm256_mul_pd(h2_imag, x2);
    y2 = _mm256_add_pd(y2, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));

    for (i = 2; i < nb; i++)
    {
        //h1 = conj(hh[i-1]);
        //h2 = conj(hh[ldh+i]);
        //x1 += (q[(i*ldq)+0] * h1);
        //y1 += (q[(i*ldq)+0] * h2);
        //x2 += (q[(i*ldq)+1] * h1);
        //y2 += (q[(i*ldq)+1] * h2);
        //x3 += (q[(i*ldq)+2] * h1);
        //y3 += (q[(i*ldq)+2] * h2);
        //x4 += (q[(i*ldq)+3] * h1);
        //y4 += (q[(i*ldq)+3] * h2);
        q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
        q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);

        h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
        h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);
        // conjugate
        h1_imag = _mm256_xor_pd(h1_imag, sign);

        tmp1 = _mm256_mul_pd(h1_imag, q1);
        x1 = _mm256_add_pd(x1, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
        tmp2 = _mm256_mul_pd(h1_imag, q2);
        x2 = _mm256_add_pd(x2, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));

        h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
        h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);
        // conjugate
        h2_imag = _mm256_xor_pd(h2_imag, sign);

        tmp1 = _mm256_mul_pd(h2_imag, q1);
        y1 = _mm256_add_pd(y1, _mm256_addsub_pd(_mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
        tmp2 = _mm256_mul_pd(h2_imag, q2);
        y2 = _mm256_add_pd(y2, _mm256_addsub_pd(_mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
    }

    //h1 = conj(hh[nb-1]);
    //x1 += (q[(nb*ldq)+0] * h1);
    //x2 += (q[(nb*ldq)+1] * h1);
    //x3 += (q[(nb*ldq)+2] * h1);
    //x4 += (q[(nb*ldq)+3] * h1);
    h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
    h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);
    // conjugate
    h1_imag = _mm256_xor_pd(h1_imag, sign);

    q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
    q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]);

    tmp1 = _mm256_mul_pd(h1_imag, q1);
    x1 = _mm256_add_pd(x1, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
    tmp2 = _mm256_mul_pd(h1_imag, q2);
    x2 = _mm256_add_pd(x2, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));

    //tau1 = hh[0];
    //h1 = (-1.0)*tau1;
    h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
    h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
    h1_real = _mm256_xor_pd(h1_real, sign);
    h1_imag = _mm256_xor_pd(h1_imag, sign);

    //x1 *= h1;
    //x2 *= h1;
    //x3 *= h1;
    //x4 *= h1;
    tmp1 = _mm256_mul_pd(h1_imag, x1);
    x1 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
    tmp2 = _mm256_mul_pd(h1_imag, x2);
    x2 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));

    //tau2 = hh[ldh];
    //h1 = (-1.0)*tau2;
    //h2 = (-1.0)*tau2;
    h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
    h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
    h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
    h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
    h1_real = _mm256_xor_pd(h1_real, sign);
    h1_imag = _mm256_xor_pd(h1_imag, sign);
    h2_real = _mm256_xor_pd(h2_real, sign);
    h2_imag = _mm256_xor_pd(h2_imag, sign);

    //h2 *= s;
    __m128d tmp_s_128 = _mm_loadu_pd(s_dbl);
    tmp2 = _mm256_broadcast_pd(&tmp_s_128);
    tmp1 = _mm256_mul_pd(h2_imag, tmp2);
    tmp2 = _mm256_addsub_pd(_mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
    _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2));
    h2_real = _mm256_broadcast_sd(&s_dbl[0]);
    h2_imag = _mm256_broadcast_sd(&s_dbl[1]);

    //y1 = y1*h1 +x1*h2;
    //y2 = y2*h1 +x2*h2;
    //y3 = y3*h1 +x3*h2;
    //y4 = y4*h1 +x4*h2;
    tmp1 = _mm256_mul_pd(h1_imag, y1);
    y1 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
    tmp2 = _mm256_mul_pd(h1_imag, y2);
    y2 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));

    // y1+=x1*h2
    tmp1 = _mm256_mul_pd(h2_imag, x1);
    y1 = _mm256_add_pd(y1, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));