elpa / elpa · Commits

Commit d081b04a
authored Jan 04, 2013 by Alexander Heinecke

added FMA4 for one routine of 2hv complex kernels

parent 0c4f9edb
Changes: 1
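The change guarded by __FMA4__ throughout this commit swaps the plain-AVX complex multiply-accumulate (a multiply followed by _mm256_addsub_pd, with the imaginary part of the Householder coefficient pre-negated via _mm256_xor_pd) for AMD's FMA4 intrinsics _mm256_msubadd_pd and _mm256_maddsub_pd, which fold the multiply, the sign handling, and the interleaved add/subtract into one instruction. Below is a minimal, self-contained sketch of the two paths; the helper name cplx_conj_madd, the main() driver, the test values, and the -0.0 sign mask are illustrative and not taken from the ELPA sources.

// Illustrative sketch, not ELPA code: accumulate acc += conj(h) * x for two packed
// complex doubles laid out as [re0, im0, re1, im1], the layout q_dbl/hh_dbl use below.
#include <immintrin.h>   // AVX; the FMA4 intrinsics additionally require -mfma4 (GCC exposes them via x86intrin.h)
#ifdef __FMA4__
#include <x86intrin.h>
#endif
#include <cstdio>

static inline __m256d cplx_conj_madd(__m256d acc, __m256d h_real, __m256d h_imag, __m256d x)
{
    __m256d tmp = _mm256_mul_pd(h_imag, x);            // [Im*re0, Im*im0, Im*re1, Im*im1]
    __m256d swp = _mm256_shuffle_pd(tmp, tmp, 0x5);    // [Im*im0, Im*re0, Im*im1, Im*re1]
#ifdef __FMA4__
    // FMA4 path: msubadd adds in the even slots and subtracts in the odd slots,
    // so no explicit conjugation of h_imag is needed.
    return _mm256_add_pd(acc, _mm256_msubadd_pd(h_real, x, swp));
#else
    // Plain AVX path: flip the sign (the kernel does this once per coefficient with
    // _mm256_xor_pd against a sign mask), multiply, then addsub.
    const __m256d sign = _mm256_set1_pd(-0.0);
    __m256d neg = _mm256_xor_pd(swp, sign);
    return _mm256_add_pd(acc, _mm256_addsub_pd(_mm256_mul_pd(h_real, x), neg));
#endif
}

int main()
{
    double h[2] = { 2.0, 3.0 };                        // h = 2+3i
    double x[4] = { 4.0, 5.0, 4.0, 5.0 };              // x = 4+5i in both complex slots
    double out[4];
    __m256d acc = _mm256_setzero_pd();
    acc = cplx_conj_madd(acc, _mm256_broadcast_sd(&h[0]), _mm256_broadcast_sd(&h[1]),
                         _mm256_loadu_pd(x));
    _mm256_storeu_pd(out, acc);
    std::printf("conj(2+3i)*(4+5i) = %g%+gi\n", out[0], out[1]);   // expect 23-2i
    return 0;
}

Both branches produce the same conjugated product; the FMA4 branch simply saves one multiply and the explicit sign flip per update.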
ELPA_2011.12.Intrinsics/src/elpa2_kernels/elpa2_tum_kernels_complex_sse-avx_2hv.cpp
@@ -149,6 +149,16 @@ extern "C" void double_hh_trafo_complex_(std::complex<double>* q, std::complex<d
    }

#ifdef __AVX__
#if 1
    for (i = 0; i < nq-4; i+=8)
    {
        hh_trafo_complex_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
    }
    if (nq-i > 0)
    {
        hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
    }
#else
    for (i = 0; i < nq-4; i+=6)
    {
        hh_trafo_complex_kernel_6_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
@@ -161,6 +171,7 @@ extern "C" void double_hh_trafo_complex_(std::complex<double>* q, std::complex<d
    {
        hh_trafo_complex_kernel_2_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
    }
#endif
#else
    for (i = 0; i < nq; i+=4)
    {
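The two hunks above set the column blocking for the nq columns of q: with AVX enabled, the main loop handles eight columns per iteration and a four-column kernel takes the remainder (the disabled #else branch would use six- and two-column kernels instead), while the non-AVX path below walks the columns four at a time. A minimal sketch of that dispatch follows; process8 and process4 are illustrative stand-ins for hh_trafo_complex_kernel_8_AVX_2hv and hh_trafo_complex_kernel_4_AVX_2hv, and the bounds appear to assume that nq is a multiple of 4, so the tail is at most one 4-column block.

// Illustrative only: the function and parameter names are not from the ELPA sources.
#include <complex>

void apply_blocked(std::complex<double>* q, int nq,
                   void (*process8)(std::complex<double>*),
                   void (*process4)(std::complex<double>*))
{
    int i;
    for (i = 0; i < nq - 4; i += 8)   // main loop: eight columns per iteration
    {
        process8(&q[i]);
    }
    if (nq - i > 0)                   // leftover block of four columns, if any
    {
        process4(&q[i]);
    }
}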
@@ -192,8 +203,10 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
    h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]);
    h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __FMA4__
    // conjugate
    h2_imag = _mm256_xor_pd(h2_imag, sign);
#endif

    y1 = _mm256_load_pd(&q_dbl[0]);
    y2 = _mm256_load_pd(&q_dbl[4]);
@@ -201,13 +214,29 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
    y4 = _mm256_load_pd(&q_dbl[12]);

    tmp1 = _mm256_mul_pd(h2_imag, x1);
#ifdef __FMA4__
    y1 = _mm256_add_pd(y1, _mm256_msubadd_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
    y1 = _mm256_add_pd(y1, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
    tmp2 = _mm256_mul_pd(h2_imag, x2);
#ifdef __FMA4__
    y2 = _mm256_add_pd(y2, _mm256_msubadd_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
    y2 = _mm256_add_pd(y2, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
    tmp3 = _mm256_mul_pd(h2_imag, x3);
#ifdef __FMA4__
    y3 = _mm256_add_pd(y3, _mm256_msubadd_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
    y3 = _mm256_add_pd(y3, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
    tmp4 = _mm256_mul_pd(h2_imag, x4);
#ifdef __FMA4__
    y4 = _mm256_add_pd(y4, _mm256_msubadd_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
    y4 = _mm256_add_pd(y4, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

    for (i = 2; i < nb; i++)
    {
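In the non-FMA4 branches above, the imaginary part of the broadcast coefficient is negated once with _mm256_xor_pd against a sign mask (the #ifndef __FMA4__ blocks) and _mm256_addsub_pd then completes the conjugated product; with FMA4, _mm256_msubadd_pd absorbs both the sign handling and one multiply, which is why the explicit conjugation compiles away. The sign constant itself is declared outside the hunks shown here; a common choice is a mask with only the sign bit set, as in this hedged sketch:

// Hedged sketch of the sign-flip used by the plain-AVX branches; the mask value is an
// assumption, since the kernel's sign variable is defined outside the hunks shown.
#include <immintrin.h>

static inline __m256d negate_all(__m256d v)
{
    const __m256d sign_mask = _mm256_set1_pd(-0.0);   // only the sign bit set in each double
    return _mm256_xor_pd(v, sign_mask);               // flips the sign of all four doubles
}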
@@ -218,37 +247,75 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
        h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]);
        h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);
#ifndef __FMA4__
        // conjugate
        h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif

        tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef __FMA4__
        x1 = _mm256_add_pd(x1, _mm256_msubadd_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
        x1 = _mm256_add_pd(x1, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
        tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __FMA4__
        x2 = _mm256_add_pd(x2, _mm256_msubadd_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
        x2 = _mm256_add_pd(x2, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
        tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __FMA4__
        x3 = _mm256_add_pd(x3, _mm256_msubadd_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
        x3 = _mm256_add_pd(x3, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
        tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __FMA4__
        x4 = _mm256_add_pd(x4, _mm256_msubadd_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
        x4 = _mm256_add_pd(x4, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

        h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
        h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __FMA4__
        // conjugate
        h2_imag = _mm256_xor_pd(h2_imag, sign);
#endif

        tmp1 = _mm256_mul_pd(h2_imag, q1);
#ifdef __FMA4__
        y1 = _mm256_add_pd(y1, _mm256_msubadd_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
        y1 = _mm256_add_pd(y1, _mm256_addsub_pd(_mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
        tmp2 = _mm256_mul_pd(h2_imag, q2);
#ifdef __FMA4__
        y2 = _mm256_add_pd(y2, _mm256_msubadd_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
        y2 = _mm256_add_pd(y2, _mm256_addsub_pd(_mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
        tmp3 = _mm256_mul_pd(h2_imag, q3);
#ifdef __FMA4__
        y3 = _mm256_add_pd(y3, _mm256_msubadd_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
        y3 = _mm256_add_pd(y3, _mm256_addsub_pd(_mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
        tmp4 = _mm256_mul_pd(h2_imag, q4);
#ifdef __FMA4__
        y4 = _mm256_add_pd(y4, _mm256_msubadd_pd(h2_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
        y4 = _mm256_add_pd(y4, _mm256_addsub_pd(_mm256_mul_pd(h2_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
    }

    h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]);
    h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]);
#ifndef __FMA4__
    // conjugate
    h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif

    q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
    q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]);
@@ -256,13 +323,29 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
    q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]);

    tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef __FMA4__
    x1 = _mm256_add_pd(x1, _mm256_msubadd_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
    x1 = _mm256_add_pd(x1, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
    tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __FMA4__
    x2 = _mm256_add_pd(x2, _mm256_msubadd_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
    x2 = _mm256_add_pd(x2, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
    tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __FMA4__
    x3 = _mm256_add_pd(x3, _mm256_msubadd_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
    x3 = _mm256_add_pd(x3, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
    tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __FMA4__
    x4 = _mm256_add_pd(x4, _mm256_msubadd_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
    x4 = _mm256_add_pd(x4, _mm256_addsub_pd(_mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

    h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
    h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
@@ -270,13 +353,29 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
    h1_imag = _mm256_xor_pd(h1_imag, sign);

    tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __FMA4__
    x1 = _mm256_maddsub_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
    x1 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
    tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __FMA4__
    x2 = _mm256_maddsub_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
    x2 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif
    tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __FMA4__
    x3 = _mm256_maddsub_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#else
    x3 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#endif
    tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __FMA4__
    x4 = _mm256_maddsub_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#else
    x4 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#endif

    h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]);
    h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]);
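In the hunk above the vectors x1..x4 are scaled in place rather than accumulated, so the FMA4 branch uses _mm256_maddsub_pd directly with no surrounding _mm256_add_pd; because h1_imag appears to stay negated by the unconditional _mm256_xor_pd at the top of the hunk, maddsub and the addsub fallback produce the same conjugated product. The per-lane behaviour assumed throughout this commit, written out as scalar reference code (illustrative, not ELPA code):

// Scalar reference for one 128-bit lane (slot 0 = real part, slot 1 = imaginary part).
static inline void addsub2(const double a[2], const double b[2], double r[2])
{   // _mm256_addsub_pd: subtract in the even slot, add in the odd slot
    r[0] = a[0] - b[0];
    r[1] = a[1] + b[1];
}
static inline void maddsub2(const double a[2], const double b[2], const double c[2], double r[2])
{   // _mm256_maddsub_pd (FMA4): fused multiply, then the same subtract/add pattern
    r[0] = a[0] * b[0] - c[0];
    r[1] = a[1] * b[1] + c[1];
}
static inline void msubadd2(const double a[2], const double b[2], const double c[2], double r[2])
{   // _mm256_msubadd_pd (FMA4): fused multiply, then add/subtract (the mirror of maddsub)
    r[0] = a[0] * b[0] + c[0];
    r[1] = a[1] * b[1] - c[1];
}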
@@ -291,28 +390,64 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
    __m128d tmp_s_128 = _mm_loadu_pd(s_dbl);
    tmp2 = _mm256_broadcast_pd(&tmp_s_128);
    tmp1 = _mm256_mul_pd(h2_imag, tmp2);
#ifdef __FMA4__
    tmp2 = _mm256_maddsub_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
    tmp2 = _mm256_addsub_pd(_mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
    _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2));
    h2_real = _mm256_broadcast_sd(&s_dbl[0]);
    h2_imag = _mm256_broadcast_sd(&s_dbl[1]);

    tmp1 = _mm256_mul_pd(h1_imag, y1);
#ifdef __FMA4__
    y1 = _mm256_maddsub_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
    y1 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
    tmp2 = _mm256_mul_pd(h1_imag, y2);
#ifdef __FMA4__
    y2 = _mm256_maddsub_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
    y2 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif
    tmp3 = _mm256_mul_pd(h1_imag, y3);
#ifdef __FMA4__
    y3 = _mm256_maddsub_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#else
    y3 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#endif
    tmp4 = _mm256_mul_pd(h1_imag, y4);
#ifdef __FMA4__
    y4 = _mm256_maddsub_pd(h1_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#else
    y4 = _mm256_addsub_pd(_mm256_mul_pd(h1_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#endif

    tmp1 = _mm256_mul_pd(h2_imag, x1);
#ifdef __FMA4__
    y1 = _mm256_add_pd(y1, _mm256_maddsub_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
    y1 = _mm256_add_pd(y1, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
    tmp2 = _mm256_mul_pd(h2_imag, x2);
#ifdef __FMA4__
    y2 = _mm256_add_pd(y2, _mm256_maddsub_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
    y2 = _mm256_add_pd(y2, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
    tmp3 = _mm256_mul_pd(h2_imag, x3);
#ifdef __FMA4__
    y3 = _mm256_add_pd(y3, _mm256_maddsub_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
    y3 = _mm256_add_pd(y3, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
    tmp4 = _mm256_mul_pd(h2_imag, x4);
#ifdef __FMA4__
    y4 = _mm256_add_pd(y4, _mm256_maddsub_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
    y4 = _mm256_add_pd(y4, _mm256_addsub_pd(_mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

    q1 = _mm256_load_pd(&q_dbl[0]);
    q2 = _mm256_load_pd(&q_dbl[4]);
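The first few lines of the hunk above update the two-vector coupling scalar s in place: the 128-bit complex value is loaded, broadcast to both lanes of a 256-bit register, multiplied by the broadcast coefficient with the same maddsub/addsub recipe, and only the low lane is written back. A small self-contained sketch of that round trip follows; the function name scale_s and the test values are illustrative, and whether the coefficient enters conjugated depends on context collapsed out of this diff (the sketch computes the plain product h*s).

// Illustrative sketch of the broadcast/low-lane-store pattern, not ELPA code.
#include <immintrin.h>
#include <cstdio>

static void scale_s(double s_dbl[2], double h_re, double h_im)
{
    __m128d s128   = _mm_loadu_pd(s_dbl);
    __m256d s256   = _mm256_broadcast_pd(&s128);          // same complex value in both lanes
    __m256d h_real = _mm256_set1_pd(h_re);
    __m256d h_imag = _mm256_set1_pd(h_im);
    __m256d tmp    = _mm256_mul_pd(h_imag, s256);
    __m256d prod   = _mm256_addsub_pd(_mm256_mul_pd(h_real, s256),
                                      _mm256_shuffle_pd(tmp, tmp, 0x5));
    _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(prod));   // keep only the low complex value
}

int main()
{
    double s[2] = { 3.0, 4.0 };              // s = 3+4i
    scale_s(s, 1.0, 2.0);                    // multiply by 1+2i
    std::printf("%g%+gi\n", s[0], s[1]);     // expect -5+10i
    return 0;
}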
@@ -343,13 +478,29 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
    q4 = _mm256_add_pd(q4, x4);

    tmp1 = _mm256_mul_pd(h2_imag, y1);
#ifdef __FMA4__
    q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
    q1 = _mm256_add_pd(q1, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
    tmp2 = _mm256_mul_pd(h2_imag, y2);
#ifdef __FMA4__
    q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
    q2 = _mm256_add_pd(q2, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
    tmp3 = _mm256_mul_pd(h2_imag, y3);
#ifdef __FMA4__
    q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
    q3 = _mm256_add_pd(q3, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
    tmp4 = _mm256_mul_pd(h2_imag, y4);
#ifdef __FMA4__
    q4 = _mm256_add_pd(q4, _mm256_maddsub_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
    q4 = _mm256_add_pd(q4, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

    _mm256_store_pd(&q_dbl[(ldq*2)+0], q1);
    _mm256_store_pd(&q_dbl[(ldq*2)+4], q2);
@@ -367,25 +518,57 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
    h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]);

    tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __FMA4__
    q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
    q1 = _mm256_add_pd(q1, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
    tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __FMA4__
    q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
    q2 = _mm256_add_pd(q2, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
    tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __FMA4__
    q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
    q3 = _mm256_add_pd(q3, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
    tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __FMA4__
    q4 = _mm256_add_pd(q4, _mm256_maddsub_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
    q4 = _mm256_add_pd(q4, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

    h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]);
    h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]);

    tmp1 = _mm256_mul_pd(h2_imag, y1);
#ifdef __FMA4__
    q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
    q1 = _mm256_add_pd(q1, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
    tmp2 = _mm256_mul_pd(h2_imag, y2);
#ifdef __FMA4__
    q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
    q2 = _mm256_add_pd(q2, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
    tmp3 = _mm256_mul_pd(h2_imag, y3);
#ifdef __FMA4__
    q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
    q3 = _mm256_add_pd(q3, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
    tmp4 = _mm256_mul_pd(h2_imag, y4);
#ifdef __FMA4__
    q4 = _mm256_add_pd(q4, _mm256_maddsub_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
    q4 = _mm256_add_pd(q4, _mm256_addsub_pd(_mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

    _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
    _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
@@ -401,13 +584,29 @@ extern "C" __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<dou
    q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]);

    tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __FMA4__
    q1 = _mm256_add_pd(q1, _mm256_maddsub_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
    q1 = _mm256_add_pd(q1, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
    tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __FMA4__
    q2 = _mm256_add_pd(q2, _mm256_maddsub_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
    q2 = _mm256_add_pd(q2, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
    tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __FMA4__
    q3 = _mm256_add_pd(q3, _mm256_maddsub_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
    q3 = _mm256_add_pd(q3, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
    tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __FMA4__
    q4 = _mm256_add_pd(q4, _mm256_maddsub_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
    q4 = _mm256_add_pd(q4, _mm256_addsub_pd(_mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

    _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
    _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2);