elpa / elpa · Commits

Commit 5093e483 (unverified)
Authored Apr 08, 2016 by Andreas Marek
Parent 24867b0e · Changes 3

    Remove FMA references in SSE kernels, they are not used anyway

src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c (view file @ 5093e483)
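Context for the change: the branches removed below were guarded by __ELPA_USE_FMA__ and used _mm_macc_pd, the fused multiply-add intrinsic from AMD's FMA4/XOP extension, while the surviving path expresses the same update with plain SSE2 multiply and add. A minimal standalone sketch of the two equivalent patterns (illustrative only, not code from the patched file; _mm_macc_pd needs XOP/FMA4 compiler and hardware support):

    #include <emmintrin.h>          /* SSE2: _mm_load_pd, _mm_mul_pd, _mm_add_pd */
    #ifdef __ELPA_USE_FMA__
    #include <x86intrin.h>          /* XOP/FMA4: _mm_macc_pd */
    #endif

    /* y = q + x*h1 on a pair of doubles -- the core update pattern of these kernels */
    static inline __m128d fused_or_split_update(__m128d x, __m128d h1, __m128d q)
    {
    #ifdef __ELPA_USE_FMA__
        return _mm_macc_pd(x, h1, q);              /* one fused op, one rounding */
    #else
        return _mm_add_pd(q, _mm_mul_pd(x, h1));   /* SSE2: multiply, then add */
    #endif
    }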
@@ -181,20 +181,6 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
 	__m128d h1 = _mm_loaddup_pd(&hh[ldh+1]);
 	__m128d h2;
-#ifdef __ELPA_USE_FMA__
-	__m128d q1 = _mm_load_pd(q);
-	__m128d y1 = _mm_macc_pd(x1, h1, q1);
-	__m128d q2 = _mm_load_pd(&q[2]);
-	__m128d y2 = _mm_macc_pd(x2, h1, q2);
-	__m128d q3 = _mm_load_pd(&q[4]);
-	__m128d y3 = _mm_macc_pd(x3, h1, q3);
-	__m128d q4 = _mm_load_pd(&q[6]);
-	__m128d y4 = _mm_macc_pd(x4, h1, q4);
-	__m128d q5 = _mm_load_pd(&q[8]);
-	__m128d y5 = _mm_macc_pd(x5, h1, q5);
-	__m128d q6 = _mm_load_pd(&q[10]);
-	__m128d y6 = _mm_macc_pd(x6, h1, q6);
-#else
 	__m128d q1 = _mm_load_pd(q);
 	__m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1));
 	__m128d q2 = _mm_load_pd(&q[2]);
@@ -207,31 +193,12 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
 	__m128d y5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1));
 	__m128d q6 = _mm_load_pd(&q[10]);
 	__m128d y6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1));
-#endif
 	for (i = 2; i < nb; i++)
 	{
 		h1 = _mm_loaddup_pd(&hh[i-1]);
 		h2 = _mm_loaddup_pd(&hh[ldh+i]);
-#ifdef __ELPA_USE_FMA__
-		q1 = _mm_load_pd(&q[i*ldq]);
-		x1 = _mm_macc_pd(q1, h1, x1);
-		y1 = _mm_macc_pd(q1, h2, y1);
-		q2 = _mm_load_pd(&q[(i*ldq)+2]);
-		x2 = _mm_macc_pd(q2, h1, x2);
-		y2 = _mm_macc_pd(q2, h2, y2);
-		q3 = _mm_load_pd(&q[(i*ldq)+4]);
-		x3 = _mm_macc_pd(q3, h1, x3);
-		y3 = _mm_macc_pd(q3, h2, y3);
-		q4 = _mm_load_pd(&q[(i*ldq)+6]);
-		x4 = _mm_macc_pd(q4, h1, x4);
-		y4 = _mm_macc_pd(q4, h2, y4);
-		q5 = _mm_load_pd(&q[(i*ldq)+8]);
-		x5 = _mm_macc_pd(q5, h1, x5);
-		y5 = _mm_macc_pd(q5, h2, y5);
-		q6 = _mm_load_pd(&q[(i*ldq)+10]);
-		x6 = _mm_macc_pd(q6, h1, x6);
-		y6 = _mm_macc_pd(q6, h2, y6);
-#else
 		q1 = _mm_load_pd(&q[i*ldq]);
 		x1 = _mm_add_pd(x1, _mm_mul_pd(q1, h1));
 		y1 = _mm_add_pd(y1, _mm_mul_pd(q1, h2));
@@ -250,24 +217,10 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
 		q6 = _mm_load_pd(&q[(i*ldq)+10]);
 		x6 = _mm_add_pd(x6, _mm_mul_pd(q6, h1));
 		y6 = _mm_add_pd(y6, _mm_mul_pd(q6, h2));
-#endif
 	}
 	h1 = _mm_loaddup_pd(&hh[nb-1]);
-#ifdef __ELPA_USE_FMA__
-	q1 = _mm_load_pd(&q[nb*ldq]);
-	x1 = _mm_macc_pd(q1, h1, x1);
-	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
-	x2 = _mm_macc_pd(q2, h1, x2);
-	q3 = _mm_load_pd(&q[(nb*ldq)+4]);
-	x3 = _mm_macc_pd(q3, h1, x3);
-	q4 = _mm_load_pd(&q[(nb*ldq)+6]);
-	x4 = _mm_macc_pd(q4, h1, x4);
-	q5 = _mm_load_pd(&q[(nb*ldq)+8]);
-	x5 = _mm_macc_pd(q5, h1, x5);
-	q6 = _mm_load_pd(&q[(nb*ldq)+10]);
-	x6 = _mm_macc_pd(q6, h1, x6);
-#else
 	q1 = _mm_load_pd(&q[nb*ldq]);
 	x1 = _mm_add_pd(x1, _mm_mul_pd(q1, h1));
 	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
@@ -280,7 +233,6 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
 	x5 = _mm_add_pd(x5, _mm_mul_pd(q5, h1));
 	q6 = _mm_load_pd(&q[(nb*ldq)+10]);
 	x6 = _mm_add_pd(x6, _mm_mul_pd(q6, h1));
-#endif
 	/////////////////////////////////////////////////////
 	// Rank-2 update of Q [12 x nb+1]
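The "Rank-2 update" comment marks the write-back phase: after the dot products have been accumulated, the kernel applies both Householder reflectors to the 12 x (nb+1) block of Q. A rough scalar restatement of that update (a hypothetical helper for illustration only; it assumes x and y already hold the fully scaled accumulators, as they do at this point in the kernel):

    /* Scalar view of the rank-2 write-back the SIMD code performs below.
       ncols is the kernel width (12 here); q, hh, ldq, ldh as in the file. */
    static void rank2_update_scalar(double *q, const double *hh, int nb,
                                    int ldq, int ldh,
                                    const double *x, const double *y, int ncols)
    {
        for (int lane = 0; lane < ncols; lane++)
        {
            q[lane] += y[lane];                               /* row 0: y only    */
            q[ldq + lane] += x[lane] + y[lane] * hh[ldh + 1]; /* row 1            */
            for (int i = 2; i < nb; i++)                      /* interior rows    */
                q[i * ldq + lane] += x[lane] * hh[i - 1]
                                   + y[lane] * hh[ldh + i];
            q[nb * ldq + lane] += x[lane] * hh[nb - 1];       /* last row: x only */
        }
    }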
@@ -299,21 +251,13 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
 	x6 = _mm_mul_pd(x6, h1);
 	h1 = _mm_xor_pd(tau2, sign);
 	h2 = _mm_mul_pd(h1, vs);
-#ifdef __ELPA_USE_FMA__
-	y1 = _mm_macc_pd(y1, h1, _mm_mul_pd(x1, h2));
-	y2 = _mm_macc_pd(y2, h1, _mm_mul_pd(x2, h2));
-	y3 = _mm_macc_pd(y3, h1, _mm_mul_pd(x3, h2));
-	y4 = _mm_macc_pd(y4, h1, _mm_mul_pd(x4, h2));
-	y5 = _mm_macc_pd(y5, h1, _mm_mul_pd(x5, h2));
-	y6 = _mm_macc_pd(y6, h1, _mm_mul_pd(x6, h2));
-#else
 	y1 = _mm_add_pd(_mm_mul_pd(y1, h1), _mm_mul_pd(x1, h2));
 	y2 = _mm_add_pd(_mm_mul_pd(y2, h1), _mm_mul_pd(x2, h2));
 	y3 = _mm_add_pd(_mm_mul_pd(y3, h1), _mm_mul_pd(x3, h2));
 	y4 = _mm_add_pd(_mm_mul_pd(y4, h1), _mm_mul_pd(x4, h2));
 	y5 = _mm_add_pd(_mm_mul_pd(y5, h1), _mm_mul_pd(x5, h2));
 	y6 = _mm_add_pd(_mm_mul_pd(y6, h1), _mm_mul_pd(x6, h2));
-#endif
 	q1 = _mm_load_pd(q);
 	q1 = _mm_add_pd(q1, y1);
@@ -335,26 +279,7 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
 	_mm_store_pd(&q[10], q6);
 	h2 = _mm_loaddup_pd(&hh[ldh+1]);
-#ifdef __ELPA_USE_FMA__
-	q1 = _mm_load_pd(&q[ldq]);
-	q1 = _mm_add_pd(q1, _mm_macc_pd(y1, h2, x1));
-	_mm_store_pd(&q[ldq], q1);
-	q2 = _mm_load_pd(&q[ldq+2]);
-	q2 = _mm_add_pd(q2, _mm_macc_pd(y2, h2, x2));
-	_mm_store_pd(&q[ldq+2], q2);
-	q3 = _mm_load_pd(&q[ldq+4]);
-	q3 = _mm_add_pd(q3, _mm_macc_pd(y3, h2, x3));
-	_mm_store_pd(&q[ldq+4], q3);
-	q4 = _mm_load_pd(&q[ldq+6]);
-	q4 = _mm_add_pd(q4, _mm_macc_pd(y4, h2, x4));
-	_mm_store_pd(&q[ldq+6], q4);
-	q5 = _mm_load_pd(&q[ldq+8]);
-	q5 = _mm_add_pd(q5, _mm_macc_pd(y5, h2, x5));
-	_mm_store_pd(&q[ldq+8], q5);
-	q6 = _mm_load_pd(&q[ldq+10]);
-	q6 = _mm_add_pd(q6, _mm_macc_pd(y6, h2, x6));
-	_mm_store_pd(&q[ldq+10], q6);
-#else
 	q1 = _mm_load_pd(&q[ldq]);
 	q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2)));
 	_mm_store_pd(&q[ldq], q1);
@@ -373,32 +298,12 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
 	q6 = _mm_load_pd(&q[ldq+10]);
 	q6 = _mm_add_pd(q6, _mm_add_pd(x6, _mm_mul_pd(y6, h2)));
 	_mm_store_pd(&q[ldq+10], q6);
-#endif
 	for (i = 2; i < nb; i++)
 	{
 		h1 = _mm_loaddup_pd(&hh[i-1]);
 		h2 = _mm_loaddup_pd(&hh[ldh+i]);
-#ifdef __ELPA_USE_FMA__
-		q1 = _mm_load_pd(&q[i*ldq]);
-		q1 = _mm_add_pd(q1, _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2)));
-		_mm_store_pd(&q[i*ldq], q1);
-		q2 = _mm_load_pd(&q[(i*ldq)+2]);
-		q2 = _mm_add_pd(q2, _mm_macc_pd(x2, h1, _mm_mul_pd(y2, h2)));
-		_mm_store_pd(&q[(i*ldq)+2], q2);
-		q3 = _mm_load_pd(&q[(i*ldq)+4]);
-		q3 = _mm_add_pd(q3, _mm_macc_pd(x3, h1, _mm_mul_pd(y3, h2)));
-		_mm_store_pd(&q[(i*ldq)+4], q3);
-		q4 = _mm_load_pd(&q[(i*ldq)+6]);
-		q4 = _mm_add_pd(q4, _mm_macc_pd(x4, h1, _mm_mul_pd(y4, h2)));
-		_mm_store_pd(&q[(i*ldq)+6], q4);
-		q5 = _mm_load_pd(&q[(i*ldq)+8]);
-		q5 = _mm_add_pd(q5, _mm_macc_pd(x5, h1, _mm_mul_pd(y5, h2)));
-		_mm_store_pd(&q[(i*ldq)+8], q5);
-		q6 = _mm_load_pd(&q[(i*ldq)+10]);
-		q6 = _mm_add_pd(q6, _mm_macc_pd(x6, h1, _mm_mul_pd(y6, h2)));
-		_mm_store_pd(&q[(i*ldq)+10], q6);
-#else
 		q1 = _mm_load_pd(&q[i*ldq]);
 		q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1, h1), _mm_mul_pd(y1, h2)));
 		_mm_store_pd(&q[i*ldq], q1);
@@ -417,30 +322,10 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
 		q6 = _mm_load_pd(&q[(i*ldq)+10]);
 		q6 = _mm_add_pd(q6, _mm_add_pd(_mm_mul_pd(x6, h1), _mm_mul_pd(y6, h2)));
 		_mm_store_pd(&q[(i*ldq)+10], q6);
-#endif
 	}
 	h1 = _mm_loaddup_pd(&hh[nb-1]);
-#ifdef __ELPA_USE_FMA__
-	q1 = _mm_load_pd(&q[nb*ldq]);
-	q1 = _mm_macc_pd(x1, h1, q1);
-	_mm_store_pd(&q[nb*ldq], q1);
-	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
-	q2 = _mm_macc_pd(x2, h1, q2);
-	_mm_store_pd(&q[(nb*ldq)+2], q2);
-	q3 = _mm_load_pd(&q[(nb*ldq)+4]);
-	q3 = _mm_macc_pd(x3, h1, q3);
-	_mm_store_pd(&q[(nb*ldq)+4], q3);
-	q4 = _mm_load_pd(&q[(nb*ldq)+6]);
-	q4 = _mm_macc_pd(x4, h1, q4);
-	_mm_store_pd(&q[(nb*ldq)+6], q4);
-	q5 = _mm_load_pd(&q[(nb*ldq)+8]);
-	q5 = _mm_macc_pd(x5, h1, q5);
-	_mm_store_pd(&q[(nb*ldq)+8], q5);
-	q6 = _mm_load_pd(&q[(nb*ldq)+10]);
-	q6 = _mm_macc_pd(x6, h1, q6);
-	_mm_store_pd(&q[(nb*ldq)+10], q6);
-#else
 	q1 = _mm_load_pd(&q[nb*ldq]);
 	q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1));
 	_mm_store_pd(&q[nb*ldq], q1);
@@ -459,7 +344,6 @@ void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq,
 	q6 = _mm_load_pd(&q[(nb*ldq)+10]);
 	q6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1));
 	_mm_store_pd(&q[(nb*ldq)+10], q6);
-#endif
 }
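An aside on the surrounding interface (not part of the diff): the trailing underscore and pointer arguments of double_hh_trafo_fast_ follow the usual Fortran-to-C calling convention, since ELPA's driver routines are Fortran and pass all arguments by reference. A sketch of how such a wrapper typically unpacks its arguments before invoking the width-specialized kernels that follow (the final parameter name pldh and the body are assumptions for illustration; the real body lives in this file):

    /* Hypothetical unpacking sketch, not the file's actual implementation. */
    void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq,
                               int* pldq, int* pldh /* assumed last parameter */)
    {
        int nb  = *pnb;   /* length of the two Householder vectors */
        int nq  = *pnq;   /* number of rows of Q to transform      */
        int ldq = *pldq;  /* leading dimension of q                */
        int ldh = *pldh;  /* leading dimension of hh               */
        /* ...q would then be updated in chunks, using the widest kernel
           that fits (12 columns at a time here), with the 8- and 4-wide
           kernels below covering the remainder... */
        (void)nb; (void)nq; (void)ldq; (void)ldh;
    }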
 /**
@@ -487,16 +371,6 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
 	__m128d h1 = _mm_loaddup_pd(&hh[ldh+1]);
 	__m128d h2;
-#ifdef __ELPA_USE_FMA__
-	__m128d q1 = _mm_load_pd(q);
-	__m128d y1 = _mm_macc_pd(x1, h1, q1);
-	__m128d q2 = _mm_load_pd(&q[2]);
-	__m128d y2 = _mm_macc_pd(x2, h1, q2);
-	__m128d q3 = _mm_load_pd(&q[4]);
-	__m128d y3 = _mm_macc_pd(x3, h1, q3);
-	__m128d q4 = _mm_load_pd(&q[6]);
-	__m128d y4 = _mm_macc_pd(x4, h1, q4);
-#else
 	__m128d q1 = _mm_load_pd(q);
 	__m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1));
 	__m128d q2 = _mm_load_pd(&q[2]);
@@ -505,26 +379,12 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
 	__m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1));
 	__m128d q4 = _mm_load_pd(&q[6]);
 	__m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1));
-#endif
 	for (i = 2; i < nb; i++)
 	{
 		h1 = _mm_loaddup_pd(&hh[i-1]);
 		h2 = _mm_loaddup_pd(&hh[ldh+i]);
-#ifdef __ELPA_USE_FMA__
-		q1 = _mm_load_pd(&q[i*ldq]);
-		x1 = _mm_macc_pd(q1, h1, x1);
-		y1 = _mm_macc_pd(q1, h2, y1);
-		q2 = _mm_load_pd(&q[(i*ldq)+2]);
-		x2 = _mm_macc_pd(q2, h1, x2);
-		y2 = _mm_macc_pd(q2, h2, y2);
-		q3 = _mm_load_pd(&q[(i*ldq)+4]);
-		x3 = _mm_macc_pd(q3, h1, x3);
-		y3 = _mm_macc_pd(q3, h2, y3);
-		q4 = _mm_load_pd(&q[(i*ldq)+6]);
-		x4 = _mm_macc_pd(q4, h1, x4);
-		y4 = _mm_macc_pd(q4, h2, y4);
-#else
 		q1 = _mm_load_pd(&q[i*ldq]);
 		x1 = _mm_add_pd(x1, _mm_mul_pd(q1, h1));
 		y1 = _mm_add_pd(y1, _mm_mul_pd(q1, h2));
@@ -537,20 +397,10 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
 		q4 = _mm_load_pd(&q[(i*ldq)+6]);
 		x4 = _mm_add_pd(x4, _mm_mul_pd(q4, h1));
 		y4 = _mm_add_pd(y4, _mm_mul_pd(q4, h2));
-#endif
 	}
 	h1 = _mm_loaddup_pd(&hh[nb-1]);
-#ifdef __ELPA_USE_FMA__
-	q1 = _mm_load_pd(&q[nb*ldq]);
-	x1 = _mm_macc_pd(q1, h1, x1);
-	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
-	x2 = _mm_macc_pd(q2, h1, x2);
-	q3 = _mm_load_pd(&q[(nb*ldq)+4]);
-	x3 = _mm_macc_pd(q3, h1, x3);
-	q4 = _mm_load_pd(&q[(nb*ldq)+6]);
-	x4 = _mm_macc_pd(q4, h1, x4);
-#else
 	q1 = _mm_load_pd(&q[nb*ldq]);
 	x1 = _mm_add_pd(x1, _mm_mul_pd(q1, h1));
 	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
@@ -559,7 +409,6 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
 	x3 = _mm_add_pd(x3, _mm_mul_pd(q3, h1));
 	q4 = _mm_load_pd(&q[(nb*ldq)+6]);
 	x4 = _mm_add_pd(x4, _mm_mul_pd(q4, h1));
-#endif
 	/////////////////////////////////////////////////////
 	// Rank-2 update of Q [8 x nb+1]
@@ -576,17 +425,11 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
 	x4 = _mm_mul_pd(x4, h1);
 	h1 = _mm_xor_pd(tau2, sign);
 	h2 = _mm_mul_pd(h1, vs);
-#ifdef __ELPA_USE_FMA__
-	y1 = _mm_macc_pd(y1, h1, _mm_mul_pd(x1, h2));
-	y2 = _mm_macc_pd(y2, h1, _mm_mul_pd(x2, h2));
-	y3 = _mm_macc_pd(y3, h1, _mm_mul_pd(x3, h2));
-	y4 = _mm_macc_pd(y4, h1, _mm_mul_pd(x4, h2));
-#else
 	y1 = _mm_add_pd(_mm_mul_pd(y1, h1), _mm_mul_pd(x1, h2));
 	y2 = _mm_add_pd(_mm_mul_pd(y2, h1), _mm_mul_pd(x2, h2));
 	y3 = _mm_add_pd(_mm_mul_pd(y3, h1), _mm_mul_pd(x3, h2));
 	y4 = _mm_add_pd(_mm_mul_pd(y4, h1), _mm_mul_pd(x4, h2));
-#endif
 	q1 = _mm_load_pd(q);
 	q1 = _mm_add_pd(q1, y1);
@@ -602,20 +445,7 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
 	_mm_store_pd(&q[6], q4);
 	h2 = _mm_loaddup_pd(&hh[ldh+1]);
-#ifdef __ELPA_USE_FMA__
-	q1 = _mm_load_pd(&q[ldq]);
-	q1 = _mm_add_pd(q1, _mm_macc_pd(y1, h2, x1));
-	_mm_store_pd(&q[ldq], q1);
-	q2 = _mm_load_pd(&q[ldq+2]);
-	q2 = _mm_add_pd(q2, _mm_macc_pd(y2, h2, x2));
-	_mm_store_pd(&q[ldq+2], q2);
-	q3 = _mm_load_pd(&q[ldq+4]);
-	q3 = _mm_add_pd(q3, _mm_macc_pd(y3, h2, x3));
-	_mm_store_pd(&q[ldq+4], q3);
-	q4 = _mm_load_pd(&q[ldq+6]);
-	q4 = _mm_add_pd(q4, _mm_macc_pd(y4, h2, x4));
-	_mm_store_pd(&q[ldq+6], q4);
-#else
 	q1 = _mm_load_pd(&q[ldq]);
 	q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2)));
 	_mm_store_pd(&q[ldq], q1);
@@ -628,27 +458,12 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
 	q4 = _mm_load_pd(&q[ldq+6]);
 	q4 = _mm_add_pd(q4, _mm_add_pd(x4, _mm_mul_pd(y4, h2)));
 	_mm_store_pd(&q[ldq+6], q4);
-#endif
 	for (i = 2; i < nb; i++)
 	{
 		h1 = _mm_loaddup_pd(&hh[i-1]);
 		h2 = _mm_loaddup_pd(&hh[ldh+i]);
-#ifdef __ELPA_USE_FMA__
-		q1 = _mm_load_pd(&q[i*ldq]);
-		q1 = _mm_add_pd(q1, _mm_macc_pd(x1, h1, _mm_mul_pd(y1, h2)));
-		_mm_store_pd(&q[i*ldq], q1);
-		q2 = _mm_load_pd(&q[(i*ldq)+2]);
-		q2 = _mm_add_pd(q2, _mm_macc_pd(x2, h1, _mm_mul_pd(y2, h2)));
-		_mm_store_pd(&q[(i*ldq)+2], q2);
-		q3 = _mm_load_pd(&q[(i*ldq)+4]);
-		q3 = _mm_add_pd(q3, _mm_macc_pd(x3, h1, _mm_mul_pd(y3, h2)));
-		_mm_store_pd(&q[(i*ldq)+4], q3);
-		q4 = _mm_load_pd(&q[(i*ldq)+6]);
-		q4 = _mm_add_pd(q4, _mm_macc_pd(x4, h1, _mm_mul_pd(y4, h2)));
-		_mm_store_pd(&q[(i*ldq)+6], q4);
-#else
 		q1 = _mm_load_pd(&q[i*ldq]);
 		q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1, h1), _mm_mul_pd(y1, h2)));
 		_mm_store_pd(&q[i*ldq], q1);
@@ -661,24 +476,10 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
 		q4 = _mm_load_pd(&q[(i*ldq)+6]);
 		q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4, h1), _mm_mul_pd(y4, h2)));
 		_mm_store_pd(&q[(i*ldq)+6], q4);
-#endif
 	}
 	h1 = _mm_loaddup_pd(&hh[nb-1]);
-#ifdef __ELPA_USE_FMA__
-	q1 = _mm_load_pd(&q[nb*ldq]);
-	q1 = _mm_macc_pd(x1, h1, q1);
-	_mm_store_pd(&q[nb*ldq], q1);
-	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
-	q2 = _mm_macc_pd(x2, h1, q2);
-	_mm_store_pd(&q[(nb*ldq)+2], q2);
-	q3 = _mm_load_pd(&q[(nb*ldq)+4]);
-	q3 = _mm_macc_pd(x3, h1, q3);
-	_mm_store_pd(&q[(nb*ldq)+4], q3);
-	q4 = _mm_load_pd(&q[(nb*ldq)+6]);
-	q4 = _mm_macc_pd(x4, h1, q4);
-	_mm_store_pd(&q[(nb*ldq)+6], q4);
-#else
 	q1 = _mm_load_pd(&q[nb*ldq]);
 	q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1));
 	_mm_store_pd(&q[nb*ldq], q1);
@@ -691,7 +492,6 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
 	q4 = _mm_load_pd(&q[(nb*ldq)+6]);
 	q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1));
 	_mm_store_pd(&q[(nb*ldq)+6], q4);
-#endif
 }
 /**
@@ -717,51 +517,30 @@ __forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int
 	__m128d h1 = _mm_loaddup_pd(&hh[ldh+1]);
 	__m128d h2;
-#ifdef __ELPA_USE_FMA__
-	__m128d q1 = _mm_load_pd(q);
-	__m128d y1 = _mm_macc_pd(x1, h1, q1);
-	__m128d q2 = _mm_load_pd(&q[2]);
-	__m128d y2 = _mm_macc_pd(x2, h1, q2);
-#else
 	__m128d q1 = _mm_load_pd(q);
 	__m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1));
 	__m128d q2 = _mm_load_pd(&q[2]);
 	__m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1));
-#endif
 	for (i = 2; i < nb; i++)
 	{
 		h1 = _mm_loaddup_pd(&hh[i-1]);
 		h2 = _mm_loaddup_pd(&hh[ldh+i]);
-#ifdef __ELPA_USE_FMA__
-		q1 = _mm_load_pd(&q[i*ldq]);
-		x1 = _mm_macc_pd(q1, h1, x1);
-		y1 = _mm_macc_pd(q1, h2, y1);
-		q2 = _mm_load_pd(&q[(i*ldq)+2]);
-		x2 = _mm_macc_pd(q2, h1, x2);
-		y2 = _mm_macc_pd(q2, h2, y2);
-#else
 		q1 = _mm_load_pd(&q[i*ldq]);
 		x1 = _mm_add_pd(x1, _mm_mul_pd(q1, h1));
 		y1 = _mm_add_pd(y1, _mm_mul_pd(q1, h2));
 		q2 = _mm_load_pd(&q[(i*ldq)+2]);
 		x2 = _mm_add_pd(x2, _mm_mul_pd(q2, h1));
 		y2 = _mm_add_pd(y2, _mm_mul_pd(q2, h2));
-#endif
 	}
 	h1 = _mm_loaddup_pd(&hh[nb-1]);
-#ifdef __ELPA_USE_FMA__
-	q1 = _mm_load_pd(&q[nb*ldq]);
-	x1 = _mm_macc_pd(q1, h1, x1);
-	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
-	x2 = _mm_macc_pd(q2, h1, x2);
-#else
 	q1 = _mm_load_pd(&q[nb*ldq]);