Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
5093e483
Unverified
Commit
5093e483
authored
Apr 08, 2016
by
Andreas Marek
Browse files
Remove FMA references in SSE kernels, they are not used anyway
parent
24867b0e
Changes
3
Hide whitespace changes
Inline
Side-by-side
src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c
View file @
5093e483
...
...
@@ -181,20 +181,6 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
__m128d
h1
=
_mm_loaddup_pd
(
&
hh
[
ldh
+
1
]);
__m128d
h2
;
#ifdef __ELPA_USE_FMA__
__m128d
q1
=
_mm_load_pd
(
q
);
__m128d
y1
=
_mm_macc_pd
(
x1
,
h1
,
q1
);
__m128d
q2
=
_mm_load_pd
(
&
q
[
2
]);
__m128d
y2
=
_mm_macc_pd
(
x2
,
h1
,
q2
);
__m128d
q3
=
_mm_load_pd
(
&
q
[
4
]);
__m128d
y3
=
_mm_macc_pd
(
x3
,
h1
,
q3
);
__m128d
q4
=
_mm_load_pd
(
&
q
[
6
]);
__m128d
y4
=
_mm_macc_pd
(
x4
,
h1
,
q4
);
__m128d
q5
=
_mm_load_pd
(
&
q
[
8
]);
__m128d
y5
=
_mm_macc_pd
(
x5
,
h1
,
q5
);
__m128d
q6
=
_mm_load_pd
(
&
q
[
10
]);
__m128d
y6
=
_mm_macc_pd
(
x6
,
h1
,
q6
);
#else
__m128d
q1
=
_mm_load_pd
(
q
);
__m128d
y1
=
_mm_add_pd
(
q1
,
_mm_mul_pd
(
x1
,
h1
));
__m128d
q2
=
_mm_load_pd
(
&
q
[
2
]);
...
...
@@ -207,31 +193,12 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
__m128d
y5
=
_mm_add_pd
(
q5
,
_mm_mul_pd
(
x5
,
h1
));
__m128d
q6
=
_mm_load_pd
(
&
q
[
10
]);
__m128d
y6
=
_mm_add_pd
(
q6
,
_mm_mul_pd
(
x6
,
h1
));
#endif
for
(
i
=
2
;
i
<
nb
;
i
++
)
{
h1
=
_mm_loaddup_pd
(
&
hh
[
i
-
1
]);
h2
=
_mm_loaddup_pd
(
&
hh
[
ldh
+
i
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
i
*
ldq
]);
x1
=
_mm_macc_pd
(
q1
,
h1
,
x1
);
y1
=
_mm_macc_pd
(
q1
,
h2
,
y1
);
q2
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
2
]);
x2
=
_mm_macc_pd
(
q2
,
h1
,
x2
);
y2
=
_mm_macc_pd
(
q2
,
h2
,
y2
);
q3
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
4
]);
x3
=
_mm_macc_pd
(
q3
,
h1
,
x3
);
y3
=
_mm_macc_pd
(
q3
,
h2
,
y3
);
q4
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
6
]);
x4
=
_mm_macc_pd
(
q4
,
h1
,
x4
);
y4
=
_mm_macc_pd
(
q4
,
h2
,
y4
);
q5
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
8
]);
x5
=
_mm_macc_pd
(
q5
,
h1
,
x5
);
y5
=
_mm_macc_pd
(
q5
,
h2
,
y5
);
q6
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
10
]);
x6
=
_mm_macc_pd
(
q6
,
h1
,
x6
);
y6
=
_mm_macc_pd
(
q6
,
h2
,
y6
);
#else
q1
=
_mm_load_pd
(
&
q
[
i
*
ldq
]);
x1
=
_mm_add_pd
(
x1
,
_mm_mul_pd
(
q1
,
h1
));
y1
=
_mm_add_pd
(
y1
,
_mm_mul_pd
(
q1
,
h2
));
...
...
@@ -250,24 +217,10 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
q6
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
10
]);
x6
=
_mm_add_pd
(
x6
,
_mm_mul_pd
(
q6
,
h1
));
y6
=
_mm_add_pd
(
y6
,
_mm_mul_pd
(
q6
,
h2
));
#endif
}
h1
=
_mm_loaddup_pd
(
&
hh
[
nb
-
1
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
nb
*
ldq
]);
x1
=
_mm_macc_pd
(
q1
,
h1
,
x1
);
q2
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
2
]);
x2
=
_mm_macc_pd
(
q2
,
h1
,
x2
);
q3
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
4
]);
x3
=
_mm_macc_pd
(
q3
,
h1
,
x3
);
q4
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
6
]);
x4
=
_mm_macc_pd
(
q4
,
h1
,
x4
);
q5
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
8
]);
x5
=
_mm_macc_pd
(
q5
,
h1
,
x5
);
q6
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
10
]);
x6
=
_mm_macc_pd
(
q6
,
h1
,
x6
);
#else
q1
=
_mm_load_pd
(
&
q
[
nb
*
ldq
]);
x1
=
_mm_add_pd
(
x1
,
_mm_mul_pd
(
q1
,
h1
));
q2
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
2
]);
...
...
@@ -280,7 +233,6 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
x5
=
_mm_add_pd
(
x5
,
_mm_mul_pd
(
q5
,
h1
));
q6
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
10
]);
x6
=
_mm_add_pd
(
x6
,
_mm_mul_pd
(
q6
,
h1
));
#endif
/////////////////////////////////////////////////////
// Rank-2 update of Q [12 x nb+1]
...
...
@@ -299,21 +251,13 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
x6
=
_mm_mul_pd
(
x6
,
h1
);
h1
=
_mm_xor_pd
(
tau2
,
sign
);
h2
=
_mm_mul_pd
(
h1
,
vs
);
#ifdef __ELPA_USE_FMA__
y1
=
_mm_macc_pd
(
y1
,
h1
,
_mm_mul_pd
(
x1
,
h2
));
y2
=
_mm_macc_pd
(
y2
,
h1
,
_mm_mul_pd
(
x2
,
h2
));
y3
=
_mm_macc_pd
(
y3
,
h1
,
_mm_mul_pd
(
x3
,
h2
));
y4
=
_mm_macc_pd
(
y4
,
h1
,
_mm_mul_pd
(
x4
,
h2
));
y5
=
_mm_macc_pd
(
y5
,
h1
,
_mm_mul_pd
(
x5
,
h2
));
y6
=
_mm_macc_pd
(
y6
,
h1
,
_mm_mul_pd
(
x6
,
h2
));
#else
y1
=
_mm_add_pd
(
_mm_mul_pd
(
y1
,
h1
),
_mm_mul_pd
(
x1
,
h2
));
y2
=
_mm_add_pd
(
_mm_mul_pd
(
y2
,
h1
),
_mm_mul_pd
(
x2
,
h2
));
y3
=
_mm_add_pd
(
_mm_mul_pd
(
y3
,
h1
),
_mm_mul_pd
(
x3
,
h2
));
y4
=
_mm_add_pd
(
_mm_mul_pd
(
y4
,
h1
),
_mm_mul_pd
(
x4
,
h2
));
y5
=
_mm_add_pd
(
_mm_mul_pd
(
y5
,
h1
),
_mm_mul_pd
(
x5
,
h2
));
y6
=
_mm_add_pd
(
_mm_mul_pd
(
y6
,
h1
),
_mm_mul_pd
(
x6
,
h2
));
#endif
q1
=
_mm_load_pd
(
q
);
q1
=
_mm_add_pd
(
q1
,
y1
);
...
...
@@ -335,26 +279,7 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
_mm_store_pd
(
&
q
[
10
],
q6
);
h2
=
_mm_loaddup_pd
(
&
hh
[
ldh
+
1
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_macc_pd
(
y1
,
h2
,
x1
));
_mm_store_pd
(
&
q
[
ldq
],
q1
);
q2
=
_mm_load_pd
(
&
q
[
ldq
+
2
]);
q2
=
_mm_add_pd
(
q2
,
_mm_macc_pd
(
y2
,
h2
,
x2
));
_mm_store_pd
(
&
q
[
ldq
+
2
],
q2
);
q3
=
_mm_load_pd
(
&
q
[
ldq
+
4
]);
q3
=
_mm_add_pd
(
q3
,
_mm_macc_pd
(
y3
,
h2
,
x3
));
_mm_store_pd
(
&
q
[
ldq
+
4
],
q3
);
q4
=
_mm_load_pd
(
&
q
[
ldq
+
6
]);
q4
=
_mm_add_pd
(
q4
,
_mm_macc_pd
(
y4
,
h2
,
x4
));
_mm_store_pd
(
&
q
[
ldq
+
6
],
q4
);
q5
=
_mm_load_pd
(
&
q
[
ldq
+
8
]);
q5
=
_mm_add_pd
(
q5
,
_mm_macc_pd
(
y5
,
h2
,
x5
));
_mm_store_pd
(
&
q
[
ldq
+
8
],
q5
);
q6
=
_mm_load_pd
(
&
q
[
ldq
+
10
]);
q6
=
_mm_add_pd
(
q6
,
_mm_macc_pd
(
y6
,
h2
,
x6
));
_mm_store_pd
(
&
q
[
ldq
+
10
],
q6
);
#else
q1
=
_mm_load_pd
(
&
q
[
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_add_pd
(
x1
,
_mm_mul_pd
(
y1
,
h2
)));
_mm_store_pd
(
&
q
[
ldq
],
q1
);
...
...
@@ -373,32 +298,12 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
q6
=
_mm_load_pd
(
&
q
[
ldq
+
10
]);
q6
=
_mm_add_pd
(
q6
,
_mm_add_pd
(
x6
,
_mm_mul_pd
(
y6
,
h2
)));
_mm_store_pd
(
&
q
[
ldq
+
10
],
q6
);
#endif
for
(
i
=
2
;
i
<
nb
;
i
++
)
{
h1
=
_mm_loaddup_pd
(
&
hh
[
i
-
1
]);
h2
=
_mm_loaddup_pd
(
&
hh
[
ldh
+
i
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
i
*
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_macc_pd
(
x1
,
h1
,
_mm_mul_pd
(
y1
,
h2
)));
_mm_store_pd
(
&
q
[
i
*
ldq
],
q1
);
q2
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
2
]);
q2
=
_mm_add_pd
(
q2
,
_mm_macc_pd
(
x2
,
h1
,
_mm_mul_pd
(
y2
,
h2
)));
_mm_store_pd
(
&
q
[(
i
*
ldq
)
+
2
],
q2
);
q3
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
4
]);
q3
=
_mm_add_pd
(
q3
,
_mm_macc_pd
(
x3
,
h1
,
_mm_mul_pd
(
y3
,
h2
)));
_mm_store_pd
(
&
q
[(
i
*
ldq
)
+
4
],
q3
);
q4
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
6
]);
q4
=
_mm_add_pd
(
q4
,
_mm_macc_pd
(
x4
,
h1
,
_mm_mul_pd
(
y4
,
h2
)));
_mm_store_pd
(
&
q
[(
i
*
ldq
)
+
6
],
q4
);
q5
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
8
]);
q5
=
_mm_add_pd
(
q5
,
_mm_macc_pd
(
x5
,
h1
,
_mm_mul_pd
(
y5
,
h2
)));
_mm_store_pd
(
&
q
[(
i
*
ldq
)
+
8
],
q5
);
q6
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
10
]);
q6
=
_mm_add_pd
(
q6
,
_mm_macc_pd
(
x6
,
h1
,
_mm_mul_pd
(
y6
,
h2
)));
_mm_store_pd
(
&
q
[(
i
*
ldq
)
+
10
],
q6
);
#else
q1
=
_mm_load_pd
(
&
q
[
i
*
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_add_pd
(
_mm_mul_pd
(
x1
,
h1
),
_mm_mul_pd
(
y1
,
h2
)));
_mm_store_pd
(
&
q
[
i
*
ldq
],
q1
);
...
...
@@ -417,30 +322,10 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
q6
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
10
]);
q6
=
_mm_add_pd
(
q6
,
_mm_add_pd
(
_mm_mul_pd
(
x6
,
h1
),
_mm_mul_pd
(
y6
,
h2
)));
_mm_store_pd
(
&
q
[(
i
*
ldq
)
+
10
],
q6
);
#endif
}
h1
=
_mm_loaddup_pd
(
&
hh
[
nb
-
1
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
nb
*
ldq
]);
q1
=
_mm_macc_pd
(
x1
,
h1
,
q1
);
_mm_store_pd
(
&
q
[
nb
*
ldq
],
q1
);
q2
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
2
]);
q2
=
_mm_macc_pd
(
x2
,
h1
,
q2
);
_mm_store_pd
(
&
q
[(
nb
*
ldq
)
+
2
],
q2
);
q3
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
4
]);
q3
=
_mm_macc_pd
(
x3
,
h1
,
q3
);
_mm_store_pd
(
&
q
[(
nb
*
ldq
)
+
4
],
q3
);
q4
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
6
]);
q4
=
_mm_macc_pd
(
x4
,
h1
,
q4
);
_mm_store_pd
(
&
q
[(
nb
*
ldq
)
+
6
],
q4
);
q5
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
8
]);
q5
=
_mm_macc_pd
(
x5
,
h1
,
q5
);
_mm_store_pd
(
&
q
[(
nb
*
ldq
)
+
8
],
q5
);
q6
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
10
]);
q6
=
_mm_macc_pd
(
x6
,
h1
,
q6
);
_mm_store_pd
(
&
q
[(
nb
*
ldq
)
+
10
],
q6
);
#else
q1
=
_mm_load_pd
(
&
q
[
nb
*
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_mul_pd
(
x1
,
h1
));
_mm_store_pd
(
&
q
[
nb
*
ldq
],
q1
);
...
...
@@ -459,7 +344,6 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
q6
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
10
]);
q6
=
_mm_add_pd
(
q6
,
_mm_mul_pd
(
x6
,
h1
));
_mm_store_pd
(
&
q
[(
nb
*
ldq
)
+
10
],
q6
);
#endif
}
/**
...
...
@@ -487,16 +371,6 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
__m128d
h1
=
_mm_loaddup_pd
(
&
hh
[
ldh
+
1
]);
__m128d
h2
;
#ifdef __ELPA_USE_FMA__
__m128d
q1
=
_mm_load_pd
(
q
);
__m128d
y1
=
_mm_macc_pd
(
x1
,
h1
,
q1
);
__m128d
q2
=
_mm_load_pd
(
&
q
[
2
]);
__m128d
y2
=
_mm_macc_pd
(
x2
,
h1
,
q2
);
__m128d
q3
=
_mm_load_pd
(
&
q
[
4
]);
__m128d
y3
=
_mm_macc_pd
(
x3
,
h1
,
q3
);
__m128d
q4
=
_mm_load_pd
(
&
q
[
6
]);
__m128d
y4
=
_mm_macc_pd
(
x4
,
h1
,
q4
);
#else
__m128d
q1
=
_mm_load_pd
(
q
);
__m128d
y1
=
_mm_add_pd
(
q1
,
_mm_mul_pd
(
x1
,
h1
));
__m128d
q2
=
_mm_load_pd
(
&
q
[
2
]);
...
...
@@ -505,26 +379,12 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
__m128d
y3
=
_mm_add_pd
(
q3
,
_mm_mul_pd
(
x3
,
h1
));
__m128d
q4
=
_mm_load_pd
(
&
q
[
6
]);
__m128d
y4
=
_mm_add_pd
(
q4
,
_mm_mul_pd
(
x4
,
h1
));
#endif
for
(
i
=
2
;
i
<
nb
;
i
++
)
{
h1
=
_mm_loaddup_pd
(
&
hh
[
i
-
1
]);
h2
=
_mm_loaddup_pd
(
&
hh
[
ldh
+
i
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
i
*
ldq
]);
x1
=
_mm_macc_pd
(
q1
,
h1
,
x1
);
y1
=
_mm_macc_pd
(
q1
,
h2
,
y1
);
q2
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
2
]);
x2
=
_mm_macc_pd
(
q2
,
h1
,
x2
);
y2
=
_mm_macc_pd
(
q2
,
h2
,
y2
);
q3
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
4
]);
x3
=
_mm_macc_pd
(
q3
,
h1
,
x3
);
y3
=
_mm_macc_pd
(
q3
,
h2
,
y3
);
q4
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
6
]);
x4
=
_mm_macc_pd
(
q4
,
h1
,
x4
);
y4
=
_mm_macc_pd
(
q4
,
h2
,
y4
);
#else
q1
=
_mm_load_pd
(
&
q
[
i
*
ldq
]);
x1
=
_mm_add_pd
(
x1
,
_mm_mul_pd
(
q1
,
h1
));
y1
=
_mm_add_pd
(
y1
,
_mm_mul_pd
(
q1
,
h2
));
...
...
@@ -537,20 +397,10 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
q4
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
6
]);
x4
=
_mm_add_pd
(
x4
,
_mm_mul_pd
(
q4
,
h1
));
y4
=
_mm_add_pd
(
y4
,
_mm_mul_pd
(
q4
,
h2
));
#endif
}
h1
=
_mm_loaddup_pd
(
&
hh
[
nb
-
1
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
nb
*
ldq
]);
x1
=
_mm_macc_pd
(
q1
,
h1
,
x1
);
q2
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
2
]);
x2
=
_mm_macc_pd
(
q2
,
h1
,
x2
);
q3
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
4
]);
x3
=
_mm_macc_pd
(
q3
,
h1
,
x3
);
q4
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
6
]);
x4
=
_mm_macc_pd
(
q4
,
h1
,
x4
);
#else
q1
=
_mm_load_pd
(
&
q
[
nb
*
ldq
]);
x1
=
_mm_add_pd
(
x1
,
_mm_mul_pd
(
q1
,
h1
));
q2
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
2
]);
...
...
@@ -559,7 +409,6 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
x3
=
_mm_add_pd
(
x3
,
_mm_mul_pd
(
q3
,
h1
));
q4
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
6
]);
x4
=
_mm_add_pd
(
x4
,
_mm_mul_pd
(
q4
,
h1
));
#endif
/////////////////////////////////////////////////////
// Rank-2 update of Q [8 x nb+1]
...
...
@@ -576,17 +425,11 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
x4
=
_mm_mul_pd
(
x4
,
h1
);
h1
=
_mm_xor_pd
(
tau2
,
sign
);
h2
=
_mm_mul_pd
(
h1
,
vs
);
#ifdef __ELPA_USE_FMA__
y1
=
_mm_macc_pd
(
y1
,
h1
,
_mm_mul_pd
(
x1
,
h2
));
y2
=
_mm_macc_pd
(
y2
,
h1
,
_mm_mul_pd
(
x2
,
h2
));
y3
=
_mm_macc_pd
(
y3
,
h1
,
_mm_mul_pd
(
x3
,
h2
));
y4
=
_mm_macc_pd
(
y4
,
h1
,
_mm_mul_pd
(
x4
,
h2
));
#else
y1
=
_mm_add_pd
(
_mm_mul_pd
(
y1
,
h1
),
_mm_mul_pd
(
x1
,
h2
));
y2
=
_mm_add_pd
(
_mm_mul_pd
(
y2
,
h1
),
_mm_mul_pd
(
x2
,
h2
));
y3
=
_mm_add_pd
(
_mm_mul_pd
(
y3
,
h1
),
_mm_mul_pd
(
x3
,
h2
));
y4
=
_mm_add_pd
(
_mm_mul_pd
(
y4
,
h1
),
_mm_mul_pd
(
x4
,
h2
));
#endif
q1
=
_mm_load_pd
(
q
);
q1
=
_mm_add_pd
(
q1
,
y1
);
...
...
@@ -602,20 +445,7 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
_mm_store_pd
(
&
q
[
6
],
q4
);
h2
=
_mm_loaddup_pd
(
&
hh
[
ldh
+
1
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_macc_pd
(
y1
,
h2
,
x1
));
_mm_store_pd
(
&
q
[
ldq
],
q1
);
q2
=
_mm_load_pd
(
&
q
[
ldq
+
2
]);
q2
=
_mm_add_pd
(
q2
,
_mm_macc_pd
(
y2
,
h2
,
x2
));
_mm_store_pd
(
&
q
[
ldq
+
2
],
q2
);
q3
=
_mm_load_pd
(
&
q
[
ldq
+
4
]);
q3
=
_mm_add_pd
(
q3
,
_mm_macc_pd
(
y3
,
h2
,
x3
));
_mm_store_pd
(
&
q
[
ldq
+
4
],
q3
);
q4
=
_mm_load_pd
(
&
q
[
ldq
+
6
]);
q4
=
_mm_add_pd
(
q4
,
_mm_macc_pd
(
y4
,
h2
,
x4
));
_mm_store_pd
(
&
q
[
ldq
+
6
],
q4
);
#else
q1
=
_mm_load_pd
(
&
q
[
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_add_pd
(
x1
,
_mm_mul_pd
(
y1
,
h2
)));
_mm_store_pd
(
&
q
[
ldq
],
q1
);
...
...
@@ -628,27 +458,12 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
q4
=
_mm_load_pd
(
&
q
[
ldq
+
6
]);
q4
=
_mm_add_pd
(
q4
,
_mm_add_pd
(
x4
,
_mm_mul_pd
(
y4
,
h2
)));
_mm_store_pd
(
&
q
[
ldq
+
6
],
q4
);
#endif
for
(
i
=
2
;
i
<
nb
;
i
++
)
{
h1
=
_mm_loaddup_pd
(
&
hh
[
i
-
1
]);
h2
=
_mm_loaddup_pd
(
&
hh
[
ldh
+
i
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
i
*
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_macc_pd
(
x1
,
h1
,
_mm_mul_pd
(
y1
,
h2
)));
_mm_store_pd
(
&
q
[
i
*
ldq
],
q1
);
q2
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
2
]);
q2
=
_mm_add_pd
(
q2
,
_mm_macc_pd
(
x2
,
h1
,
_mm_mul_pd
(
y2
,
h2
)));
_mm_store_pd
(
&
q
[(
i
*
ldq
)
+
2
],
q2
);
q3
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
4
]);
q3
=
_mm_add_pd
(
q3
,
_mm_macc_pd
(
x3
,
h1
,
_mm_mul_pd
(
y3
,
h2
)));
_mm_store_pd
(
&
q
[(
i
*
ldq
)
+
4
],
q3
);
q4
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
6
]);
q4
=
_mm_add_pd
(
q4
,
_mm_macc_pd
(
x4
,
h1
,
_mm_mul_pd
(
y4
,
h2
)));
_mm_store_pd
(
&
q
[(
i
*
ldq
)
+
6
],
q4
);
#else
q1
=
_mm_load_pd
(
&
q
[
i
*
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_add_pd
(
_mm_mul_pd
(
x1
,
h1
),
_mm_mul_pd
(
y1
,
h2
)));
_mm_store_pd
(
&
q
[
i
*
ldq
],
q1
);
...
...
@@ -661,24 +476,10 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
q4
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
6
]);
q4
=
_mm_add_pd
(
q4
,
_mm_add_pd
(
_mm_mul_pd
(
x4
,
h1
),
_mm_mul_pd
(
y4
,
h2
)));
_mm_store_pd
(
&
q
[(
i
*
ldq
)
+
6
],
q4
);
#endif
}
h1
=
_mm_loaddup_pd
(
&
hh
[
nb
-
1
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
nb
*
ldq
]);
q1
=
_mm_macc_pd
(
x1
,
h1
,
q1
);
_mm_store_pd
(
&
q
[
nb
*
ldq
],
q1
);
q2
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
2
]);
q2
=
_mm_macc_pd
(
x2
,
h1
,
q2
);
_mm_store_pd
(
&
q
[(
nb
*
ldq
)
+
2
],
q2
);
q3
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
4
]);
q3
=
_mm_macc_pd
(
x3
,
h1
,
q3
);
_mm_store_pd
(
&
q
[(
nb
*
ldq
)
+
4
],
q3
);
q4
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
6
]);
q4
=
_mm_macc_pd
(
x4
,
h1
,
q4
);
_mm_store_pd
(
&
q
[(
nb
*
ldq
)
+
6
],
q4
);
#else
q1
=
_mm_load_pd
(
&
q
[
nb
*
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_mul_pd
(
x1
,
h1
));
_mm_store_pd
(
&
q
[
nb
*
ldq
],
q1
);
...
...
@@ -691,7 +492,6 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
q4
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
6
]);
q4
=
_mm_add_pd
(
q4
,
_mm_mul_pd
(
x4
,
h1
));
_mm_store_pd
(
&
q
[(
nb
*
ldq
)
+
6
],
q4
);
#endif
}
/**
...
...
@@ -717,51 +517,30 @@ __forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int
__m128d
h1
=
_mm_loaddup_pd
(
&
hh
[
ldh
+
1
]);
__m128d
h2
;
#ifdef __ELPA_USE_FMA__
__m128d
q1
=
_mm_load_pd
(
q
);
__m128d
y1
=
_mm_macc_pd
(
x1
,
h1
,
q1
);
__m128d
q2
=
_mm_load_pd
(
&
q
[
2
]);
__m128d
y2
=
_mm_macc_pd
(
x2
,
h1
,
q2
);
#else
__m128d
q1
=
_mm_load_pd
(
q
);
__m128d
y1
=
_mm_add_pd
(
q1
,
_mm_mul_pd
(
x1
,
h1
));
__m128d
q2
=
_mm_load_pd
(
&
q
[
2
]);
__m128d
y2
=
_mm_add_pd
(
q2
,
_mm_mul_pd
(
x2
,
h1
));
#endif
for
(
i
=
2
;
i
<
nb
;
i
++
)
{
h1
=
_mm_loaddup_pd
(
&
hh
[
i
-
1
]);
h2
=
_mm_loaddup_pd
(
&
hh
[
ldh
+
i
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
i
*
ldq
]);
x1
=
_mm_macc_pd
(
q1
,
h1
,
x1
);
y1
=
_mm_macc_pd
(
q1
,
h2
,
y1
);
q2
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
2
]);
x2
=
_mm_macc_pd
(
q2
,
h1
,
x2
);
y2
=
_mm_macc_pd
(
q2
,
h2
,
y2
);
#else
q1
=
_mm_load_pd
(
&
q
[
i
*
ldq
]);
x1
=
_mm_add_pd
(
x1
,
_mm_mul_pd
(
q1
,
h1
));
y1
=
_mm_add_pd
(
y1
,
_mm_mul_pd
(
q1
,
h2
));
q2
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
2
]);
x2
=
_mm_add_pd
(
x2
,
_mm_mul_pd
(
q2
,
h1
));
y2
=
_mm_add_pd
(
y2
,
_mm_mul_pd
(
q2
,
h2
));
#endif
}
h1
=
_mm_loaddup_pd
(
&
hh
[
nb
-
1
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
nb
*
ldq
]);
x1
=
_mm_macc_pd
(
q1
,
h1
,
x1
);
q2
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
2
]);
x2
=
_mm_macc_pd
(
q2
,
h1
,
x2
);
#else
q1
=
_mm_load_pd
(
&
q
[
nb
*
ldq
]);
x1
=
_mm_add_pd
(
x1
,
_mm_mul_pd
(
q1
,
h1
));
q2
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
2
]);
x2
=
_mm_add_pd
(
x2
,
_mm_mul_pd
(
q2
,
h1
));
#endif
/////////////////////////////////////////////////////
// Rank-2 update of Q [4 x nb+1]
...
...
@@ -776,13 +555,9 @@ __forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int
x2
=
_mm_mul_pd
(
x2
,
h1
);
h1
=
_mm_xor_pd
(
tau2
,
sign
);
h2
=
_mm_mul_pd
(
h1
,
vs
);
#ifdef __ELPA_USE_FMA__
y1
=
_mm_macc_pd
(
y1
,
h1
,
_mm_mul_pd
(
x1
,
h2
));
y2
=
_mm_macc_pd
(
y2
,
h1
,
_mm_mul_pd
(
x2
,
h2
));
#else
y1
=
_mm_add_pd
(
_mm_mul_pd
(
y1
,
h1
),
_mm_mul_pd
(
x1
,
h2
));
y2
=
_mm_add_pd
(
_mm_mul_pd
(
y2
,
h1
),
_mm_mul_pd
(
x2
,
h2
));
#endif
q1
=
_mm_load_pd
(
q
);
q1
=
_mm_add_pd
(
q1
,
y1
);
...
...
@@ -792,58 +567,33 @@ __forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int
_mm_store_pd
(
&
q
[
2
],
q2
);
h2
=
_mm_loaddup_pd
(
&
hh
[
ldh
+
1
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_macc_pd
(
y1
,
h2
,
x1
));
_mm_store_pd
(
&
q
[
ldq
],
q1
);
q2
=
_mm_load_pd
(
&
q
[
ldq
+
2
]);
q2
=
_mm_add_pd
(
q2
,
_mm_macc_pd
(
y2
,
h2
,
x2
));
_mm_store_pd
(
&
q
[
ldq
+
2
],
q2
);
#else
q1
=
_mm_load_pd
(
&
q
[
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_add_pd
(
x1
,
_mm_mul_pd
(
y1
,
h2
)));
_mm_store_pd
(
&
q
[
ldq
],
q1
);
q2
=
_mm_load_pd
(
&
q
[
ldq
+
2
]);
q2
=
_mm_add_pd
(
q2
,
_mm_add_pd
(
x2
,
_mm_mul_pd
(
y2
,
h2
)));
_mm_store_pd
(
&
q
[
ldq
+
2
],
q2
);
#endif
for
(
i
=
2
;
i
<
nb
;
i
++
)
{
h1
=
_mm_loaddup_pd
(
&
hh
[
i
-
1
]);
h2
=
_mm_loaddup_pd
(
&
hh
[
ldh
+
i
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
i
*
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_macc_pd
(
x1
,
h1
,
_mm_mul_pd
(
y1
,
h2
)));
_mm_store_pd
(
&
q
[
i
*
ldq
],
q1
);
q2
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
2
]);
q2
=
_mm_add_pd
(
q2
,
_mm_macc_pd
(
x2
,
h1
,
_mm_mul_pd
(
y2
,
h2
)));
_mm_store_pd
(
&
q
[(
i
*
ldq
)
+
2
],
q2
);
#else
q1
=
_mm_load_pd
(
&
q
[
i
*
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_add_pd
(
_mm_mul_pd
(
x1
,
h1
),
_mm_mul_pd
(
y1
,
h2
)));
_mm_store_pd
(
&
q
[
i
*
ldq
],
q1
);
q2
=
_mm_load_pd
(
&
q
[(
i
*
ldq
)
+
2
]);
q2
=
_mm_add_pd
(
q2
,
_mm_add_pd
(
_mm_mul_pd
(
x2
,
h1
),
_mm_mul_pd
(
y2
,
h2
)));
_mm_store_pd
(
&
q
[(
i
*
ldq
)
+
2
],
q2
);
#endif
}
h1
=
_mm_loaddup_pd
(
&
hh
[
nb
-
1
]);
#ifdef __ELPA_USE_FMA__
q1
=
_mm_load_pd
(
&
q
[
nb
*
ldq
]);
q1
=
_mm_macc_pd
(
x1
,
h1
,
q1
);
_mm_store_pd
(
&
q
[
nb
*
ldq
],
q1
);
q2
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
2
]);
q2
=
_mm_macc_pd
(
x2
,
h1
,
q2
);
_mm_store_pd
(
&
q
[(
nb
*
ldq
)
+
2
],
q2
);
#else
q1
=
_mm_load_pd
(
&
q
[
nb
*
ldq
]);
q1
=
_mm_add_pd
(
q1
,
_mm_mul_pd
(
x1
,
h1
));
_mm_store_pd
(
&
q
[
nb
*
ldq
],
q1
);
q2
=
_mm_load_pd
(
&
q
[(
nb
*
ldq
)
+
2
]);
q2
=
_mm_add_pd
(
q2
,
_mm_mul_pd
(
x2
,
h1
));
_mm_store_pd
(
&
q
[(
nb
*
ldq
)
+
2
],
q2
);
#endif
}
src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
View file @
5093e483
...
...
@@ -238,15 +238,6 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int
__m128d
h_4_2
=
_mm_loaddup_pd
(
&
hh
[(
ldh
*
3
)
+
2
]);
__m128d
h_4_1
=
_mm_loaddup_pd
(
&
hh
[(
ldh
*
3
)
+
3
]);
#ifdef __ELPA_USE_FMA__
register
__m128d
w1
=
_mm_macc_pd
(
a3_1
,
h_4_3
,
a4_1
);
w1
=
_mm_macc_pd
(
a2_1
,
h_4_2
,
w1
);
w1
=
_mm_macc_pd
(
a1_1
,
h_4_1
,
w1
);
register
__m128d
z1
=
_mm_macc_pd
(
a2_1
,
h_3_2
,
a3_1
);
z1
=
_mm_macc_pd
(
a1_1
,
h_3_1
,
z1
);
register
__m128d
y1
=
_mm_macc_pd
(
a1_1
,
h_2_1
,
a2_1
);
register
__m128d
x1
=
a1_1
;
#else
register
__m128d
w1
=
_mm_add_pd
(
a4_1
,
_mm_mul_pd
(
a3_1
,
h_4_3
));
w1
=
_mm_add_pd
(
w1
,
_mm_mul_pd
(
a2_1
,
h_4_2
));
w1
=
_mm_add_pd
(
w1
,
_mm_mul_pd
(
a1_1
,
h_4_1
));
...
...
@@ -254,22 +245,12 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int
z1
=
_mm_add_pd
(
z1
,
_mm_mul_pd
(
a1_1
,
h_3_1
));
register
__m128d
y1
=
_mm_add_pd
(
a2_1
,
_mm_mul_pd
(
a1_1
,
h_2_1
));
register
__m128d
x1
=
a1_1
;
#endif
__m128d
a1_2
=
_mm_load_pd
(
&
q
[(
ldq
*
3
)
+
2
]);
__m128d
a2_2
=
_mm_load_pd
(
&
q
[(
ldq
*
2
)
+
2
]);
__m128d
a3_2
=
_mm_load_pd
(
&
q
[
ldq
+
2
]);
__m128d
a4_2
=
_mm_load_pd
(
&
q
[
0
+
2
]);
#ifdef __ELPA_USE_FMA__
register
__m128d
w2
=
_mm_macc_pd
(
a3_2
,
h_4_3
,
a4_2
);
w2
=
_mm_macc_pd
(
a2_2
,
h_4_2
,
w2
);
w2
=
_mm_macc_pd
(
a1_2
,
h_4_1
,
w2
);
register
__m128d
z2
=
_mm_macc_pd
(
a2_2
,
h_3_2
,
a3_2
);
z2
=
_mm_macc_pd
(
a1_2
,
h_3_1
,
z2
);
register
__m128d
y2
=
_mm_macc_pd
(
a1_2
,
h_2_1
,
a2_2
);
register
__m128d
x2
=
a1_2
;
#else
register
__m128d
w2
=
_mm_add_pd
(
a4_2
,
_mm_mul_pd
(
a3_2
,
h_4_3
));
w2
=
_mm_add_pd
(
w2
,
_mm_mul_pd
(
a2_2
,
h_4_2
));
w2
=
_mm_add_pd
(
w2
,
_mm_mul_pd
(
a1_2
,
h_4_1
));
...
...
@@ -277,22 +258,12 @@ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int
z2
=
_mm_add_pd
(
z2
,
_mm_mul_pd
(
a1_2
,
h_3_1
));
register
__m128d
y2
=
_mm_add_pd
(
a2_2
,
_mm_mul_pd
(
a1_2
,
h_2_1
));
register
__m128d
x2
=
a1_2
;
#endif
__m128d
a1_3
=
_mm_load_pd
(
&
q
[(
ldq
*
3
)
+
4
]);
__m128d
a2_3
=
_mm_load_pd
(
&
q
[(
ldq
*
2
)
+
4
]);
__m128d
a3_3
=
_mm_load_pd
(
&
q
[
ldq
+
4
]);
__m128d
a4_3
=
_mm_load_pd
(
&
q
[
0
+
4
]);
#ifdef __ELPA_USE_FMA__
register
__m128d
w3
=
_mm_macc_pd
(
a3_3
,
h_4_3
,
a4_3
);
w3
=
_mm_macc_pd
(
a2_3
,
h_4_2
,
w3
);
w3
=
_mm_macc_pd
(
a1_3
,
h_4_1
,
w3
);
register
__m128d
z3
=
_mm_macc_pd
(
a2_3
,
h_3_2
,
a3_3
);
z3
=
_mm_macc_pd
(
a1_3
,
h_3_1
,
z3
);
register
__m128d
y3
=
_mm_macc_pd
(
a1_3
,
h_2_1
,
a2_3
);
register
__m128d
x3
=
a1_3
;
#else
register
__m128d
w3
=
_mm_add_pd
(
a4_3
,
_mm_mul_pd
(
a3_3
,
h_4_3
));
w3
=
_mm_add_pd
(
w3
,
_mm_mul_pd
(
a2_3
,
h_4_2
));
w3
=
_mm_add_pd
(
w3
,
_mm_mul_pd<