elpa / elpa, commit 567fc568
Authored Nov 20, 2017 by Andreas Marek
Parent: afa7e5e3
Changes: 3 files. Pipelines: 1.

    Correct errors in VSX block2 kernel

Makefile.am

@@ -251,12 +251,12 @@ endif
 #endif
 #endif
 #
-if WITH_REAL_VSX_BLOCK4_KERNEL
-libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
-if WANT_SINGLE_PRECISION_REAL
-libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_single_precision.c
-endif
-endif
+#if WITH_REAL_VSX_BLOCK4_KERNEL
+#libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
+#if WANT_SINGLE_PRECISION_REAL
+#libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_single_precision.c
+#endif
+#endif
 if WITH_REAL_SSE_BLOCK4_KERNEL
 libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_4hv_double_precision.c

generate_gitlab_ci_tests.py

@@ -623,8 +623,8 @@ for cc, fc, m, o, p, a, b, g, cov, instr, addr, na in product(
     print(" - export SRUN_COMMANDLINE_CONFIGURE=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$CONFIGURETIME --constraint=$CONTSTRAINTS --mem=$REQUESTED_MEMORY\"")
     print(" - export SRUN_COMMANDLINE_BUILD=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$BUILDTIME --constraint=$CONTSTRAINTS --mem=$REQUESTED_MEMORY\"")
     print(" - export SRUN_COMMANDLINE_RUN=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$RUNTIME --constraint=$CONTSTRAINTS --mem=$REQUESTED_MEMORY\"")
-    print(" - echo \"srun --ntasks=1 --cpus-per-task=1 $SRUN_COMMANDLINE_CONFIGURE\"")
-    print(" - srun --threads-per-core=1 --ntasks-per-core=1 --ntasks=1 --cpus-per-task=1 $SRUN_COMMANDLINE_CONFIGURE" \
+    print(" - echo \"srun --cpu_bind=cores --hint=nomultithread --ntasks=1 --cpus-per-task=1 $SRUN_COMMANDLINE_CONFIGURE\"")
+    print(" - srun --cpu_bind=cores --hint=nomultithread --threads-per-core=1 --ntasks-per-core=1 --ntasks=1 --cpus-per-task=1 $SRUN_COMMANDLINE_CONFIGURE" \
           + " /scratch/elpa/bin/configure_elpa.sh" \
           + " \" CC=\\\"" + c_compiler_wrapper + "\\\"" + " CFLAGS=\\\"" + CFLAGS + "\\\"" \
           + " FC=\\\"" + fortran_compiler_wrapper + "\\\"" + " FCFLAGS=\\\"" + FCFLAGS + "\\\"" \
@@ -638,8 +638,8 @@ for cc, fc, m, o, p, a, b, g, cov, instr, addr, na in product(
     if (instr == "sse" or (instr == "avx" and g != "with-gpu")):
         print(" - make -j 8")
     if (instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu"):
-        print(" - echo \"srun --ntasks=1 --cpus-per-task=8 $SRUN_COMMANDLINE_BUILD\"")
-        print(" - srun --threads-per-core=1 --ntasks-per-core=1 --ntasks=1 --cpus-per-task=8 $SRUN_COMMANDLINE_BUILD /scratch/elpa/bin/build_elpa.sh")
+        print(" - echo \"srun --cpu_bind=cores --hint=nomultithread --ntasks=1 --cpus-per-task=8 $SRUN_COMMANDLINE_BUILD\"")
+        print(" - srun --cpu_bind=cores --hint=nomultithread --threads-per-core=1 --ntasks-per-core=1 --ntasks=1 --cpus-per-task=8 $SRUN_COMMANDLINE_BUILD /scratch/elpa/bin/build_elpa.sh")
     # do the test
     if (instr == "sse" or (instr == "avx" and g != "with-gpu")):
@@ -662,8 +662,8 @@ for cc, fc, m, o, p, a, b, g, cov, instr, addr, na in product(
         openmp_threads = " 1 "
     for na in sorted(matrix_size.keys(), reverse=True):
         cores = set_number_of_cores(MPI_TASKS, o)
-        print(" - echo \"srun --ntasks=1 --cpus-per-task=" + str(cores) + " $SRUN_COMMANDLINE_RUN\"")
-        print(" - srun --threads-per-core=1 --ntasks-per-core=1 --ntasks=1 --cpus-per-task=" + str(cores) + " $SRUN_COMMANDLINE_RUN \
+        print(" - echo \"srun --cpu_bind=cores --hint=nomultithread --ntasks=1 --cpus-per-task=" + str(cores) + " $SRUN_COMMANDLINE_RUN\"")
+        print(" - srun --cpu_bind=cores --hint=nomultithread --threads-per-core=1 --ntasks-per-core=1 --ntasks=1 --cpus-per-task=" + str(cores) + " $SRUN_COMMANDLINE_RUN \
 /scratch/elpa/bin/run_elpa.sh " + str(MPI_TASKS) + openmp_threads + " \" TEST_FLAGS=\\\"" + matrix_size[na] + " " + str(nev) + " " + str(nblk) + " \\\" || { cat test-suite.log; exit 1; } \"")
         if (cov == "coverage"):

src/elpa2/kernels/real_vsx_2hv_template.c

@@ -55,7 +55,7 @@
 #define __forceinline __attribute__((always_inline)) static
 #ifdef DOUBLE_PRECISION_REAL
 #define __SSE_DATATYPE __vector double
-#define _SSE_LOAD vec_ld
+#define _SSE_LOAD (__vector double) vec_ld
 #define _SSE_ADD vec_add
 #define _SSE_MUL vec_mul
 #define _SSE_NEG vec_neg
@@ -65,7 +65,7 @@
 #ifdef SINGLE_PRECISION_REAL
 #define __SSE_DATATYPE __vector float
-#define _SSE_LOAD vec_ld
+#define _SSE_LOAD (__vector float) vec_ld
 #define _SSE_ADD vec_add
 #define _SSE_MUL vec_mul
 #define _SSE_NEG vec_neg
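
The two _SSE_LOAD hunks above widen the macro with an explicit result cast. A plausible reading (assuming GCC's altivec.h on a VSX-enabled POWER target): the return type of vec_ld follows its pointer argument, so once the call sites below pass (unsigned long int *) pointers, the result has to be reinterpreted back to the floating-point vector type that vec_add and vec_mul expect. A minimal sketch of the pattern; load_q is a hypothetical helper, not part of the kernel:

    /* Sketch only: assumes GCC altivec.h compiled with -mvsx. */
    #include <altivec.h>

    #define __SSE_DATATYPE __vector double
    #define _SSE_LOAD (__vector double) vec_ld  /* reinterpret vec_ld's result as the FP vector type */

    static __SSE_DATATYPE load_q(double *q, int idx)
    {
        /* mirrors the kernel's q1 = _SSE_LOAD(0, (unsigned long int *) &q[idx]); */
        return _SSE_LOAD(0, (unsigned long int *) &q[idx]);
    }
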
@@ -298,6 +298,13 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 // hh contains two householder vectors, with offset 1
 /////////////////////////////////////////////////////
 int i;
+#ifdef DOUBLE_PRECISION_REAL
+double mone = -1.0;
+#endif
+#ifdef SINGLE_PRECISION_REAL
+float mone = -1.0;
+#endif
 #ifdef HAVE_SSE_INTRINSICS
 // Needed bit mask for floating point sign flip
 #ifdef DOUBLE_PRECISION_REAL
@@ -308,12 +315,12 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 #endif
 #endif
-__SSE_DATATYPE x1 = _SSE_LOAD(0, &q[ldq]);
-__SSE_DATATYPE x2 = _SSE_LOAD(0, &q[ldq+offset]);
-__SSE_DATATYPE x3 = _SSE_LOAD(0, &q[ldq+2*offset]);
-__SSE_DATATYPE x4 = _SSE_LOAD(0, &q[ldq+3*offset]);
-__SSE_DATATYPE x5 = _SSE_LOAD(0, &q[ldq+4*offset]);
-__SSE_DATATYPE x6 = _SSE_LOAD(0, &q[ldq+5*offset]);
+__SSE_DATATYPE x1 = _SSE_LOAD(0, (unsigned long int *) &q[ldq]);
+__SSE_DATATYPE x2 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+offset]);
+__SSE_DATATYPE x3 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+2*offset]);
+__SSE_DATATYPE x4 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+3*offset]);
+__SSE_DATATYPE x5 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+4*offset]);
+__SSE_DATATYPE x6 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+5*offset]);
 #ifdef HAVE_SSE_INTRINSICS
 #ifdef DOUBLE_PRECISION_REAL
@@ -334,17 +341,17 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 __SSE_DATATYPE h2;
-__SSE_DATATYPE q1 = _SSE_LOAD(0, q);
+__SSE_DATATYPE q1 = _SSE_LOAD(0, (unsigned long int *) &q[0]);
 __SSE_DATATYPE y1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
-__SSE_DATATYPE q2 = _SSE_LOAD(0, &q[offset]);
+__SSE_DATATYPE q2 = _SSE_LOAD(0, (unsigned long int *) &q[offset]);
 __SSE_DATATYPE y2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
-__SSE_DATATYPE q3 = _SSE_LOAD(0, &q[2*offset]);
+__SSE_DATATYPE q3 = _SSE_LOAD(0, (unsigned long int *) &q[2*offset]);
 __SSE_DATATYPE y3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
-__SSE_DATATYPE q4 = _SSE_LOAD(0, &q[3*offset]);
+__SSE_DATATYPE q4 = _SSE_LOAD(0, (unsigned long int *) &q[3*offset]);
 __SSE_DATATYPE y4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
-__SSE_DATATYPE q5 = _SSE_LOAD(0, &q[4*offset]);
+__SSE_DATATYPE q5 = _SSE_LOAD(0, (unsigned long int *) &q[4*offset]);
 __SSE_DATATYPE y5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
-__SSE_DATATYPE q6 = _SSE_LOAD(0, &q[5*offset]);
+__SSE_DATATYPE q6 = _SSE_LOAD(0, (unsigned long int *) &q[5*offset]);
 __SSE_DATATYPE y6 = _SSE_ADD(q6, _SSE_MUL(x6, h1));
 for (i = 2; i < nb; i++)
 {
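
For orientation, the hunk above is the start of the dot-product phase of the two-Householder-vector transform: each qN/yN pair accumulates q + x*h1 across six vector-wide columns of Q. The same update in scalar form (a sketch with illustrative names; nq stands in for the six-vector row width):

    /* Scalar sketch of the vectorized update above. */
    for (int j = 0; j < nq; j++)
        y[j] = q[j] + x[j] * h1;   /* one lane of yN = _SSE_ADD(qN, _SSE_MUL(xN, h1)) */
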
@@ -369,22 +376,22 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 #endif
 #endif
-q1 = _SSE_LOAD(0, &q[i*ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[i*ldq]);
 x1 = _SSE_ADD(x1, _SSE_MUL(q1, h1));
 y1 = _SSE_ADD(y1, _SSE_MUL(q1, h2));
-q2 = _SSE_LOAD(0, &q[(i*ldq)+offset]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+offset]);
 x2 = _SSE_ADD(x2, _SSE_MUL(q2, h1));
 y2 = _SSE_ADD(y2, _SSE_MUL(q2, h2));
-q3 = _SSE_LOAD(0, &q[(i*ldq)+2*offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+2*offset]);
 x3 = _SSE_ADD(x3, _SSE_MUL(q3, h1));
 y3 = _SSE_ADD(y3, _SSE_MUL(q3, h2));
-q4 = _SSE_LOAD(0, &q[(i*ldq)+3*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+3*offset]);
 x4 = _SSE_ADD(x4, _SSE_MUL(q4, h1));
 y4 = _SSE_ADD(y4, _SSE_MUL(q4, h2));
-q5 = _SSE_LOAD(0, &q[(i*ldq)+4*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+4*offset]);
 x5 = _SSE_ADD(x5, _SSE_MUL(q5, h1));
 y5 = _SSE_ADD(y5, _SSE_MUL(q5, h2));
-q6 = _SSE_LOAD(0, &q[(i*ldq)+5*offset]);
+q6 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+5*offset]);
 x6 = _SSE_ADD(x6, _SSE_MUL(q6, h1));
 y6 = _SSE_ADD(y6, _SSE_MUL(q6, h2));
 }
@@ -401,21 +408,21 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 h1 = vec_splats(hh[nb-1]);
 #endif
 #ifdef SINGLE_PRECISION_REAL
-h1 = vec_spalts(hh[nb-1]);
+h1 = vec_splats(hh[nb-1]);
 #endif
 #endif
-q1 = _SSE_LOAD(0, &q[nb*ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[nb*ldq]);
 x1 = _SSE_ADD(x1, _SSE_MUL(q1, h1));
-q2 = _SSE_LOAD(0, &q[(nb*ldq)+offset]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+offset]);
 x2 = _SSE_ADD(x2, _SSE_MUL(q2, h1));
-q3 = _SSE_LOAD(0, &q[(nb*ldq)+2*offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+2*offset]);
 x3 = _SSE_ADD(x3, _SSE_MUL(q3, h1));
-q4 = _SSE_LOAD(0, &q[(nb*ldq)+3*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+3*offset]);
 x4 = _SSE_ADD(x4, _SSE_MUL(q4, h1));
-q5 = _SSE_LOAD(0, &q[(nb*ldq)+4*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+4*offset]);
 x5 = _SSE_ADD(x5, _SSE_MUL(q5, h1));
-q6 = _SSE_LOAD(0, &q[(nb*ldq)+5*offset]);
+q6 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+5*offset]);
 x6 = _SSE_ADD(x6, _SSE_MUL(q6, h1));
 /////////////////////////////////////////////////////
 // Rank-2 update of Q [12 x nb+1]
@@ -441,7 +448,7 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 #ifdef SINGLE_PRECISION_REAL
 __SSE_DATATYPE tau1 = vec_splats(hh[0]);
 __SSE_DATATYPE tau2 = vec_splats(hh[ldh]);
-__SSE_DATATYPE vs = vec_splatss(s);
+__SSE_DATATYPE vs = vec_splats(s);
 #endif
 #endif
@@ -449,7 +456,8 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 h1 = _SSE_XOR(tau1, sign);
 #endif
 #ifdef HAVE_VSX_SSE
-h1 = vec_neg(tau1);
+//h1 = vec_neg(tau1);
+h1 = vec_mul(vec_splats(mone), tau1);
 #endif
 x1 = _SSE_MUL(x1, h1);
 x2 = _SSE_MUL(x2, h1);
@@ -461,7 +469,8 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 h1 = _SSE_XOR(tau2, sign);
 #endif
 #ifdef HAVE_SPARC64_SSE
-h1 = vec_neg(tau2);
+//h1 = vec_neg(tau2);
+h1 = vec_mul(vec_splats(mone), tau2);
 #endif
 h2 = _SSE_MUL(h1, vs);
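
Both negation hunks follow the same recipe: the direct vec_neg(tau) call is commented out in favour of multiplying by the splatted scalar mone declared at the top of the routine, which avoids depending on vec_neg being provided by every targeted compiler's intrinsics header. A minimal sketch of the idea (an assumption worth flagging: the negation only works if mone really holds minus one, as its name suggests; negate_tau is a hypothetical helper):

    /* Sketch only: assumes GCC altivec.h compiled with -mvsx. */
    #include <altivec.h>

    static __vector double negate_tau(double tau)
    {
        double mone = -1.0;                      /* "minus one"; must be -1.0 for a true negation */
        __vector double vtau = vec_splats(tau);  /* broadcast tau into every lane */
        return vec_mul(vec_splats(mone), vtau);  /* -tau in every lane, no vec_neg needed */
    }
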
@@ -471,24 +480,24 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 y4 = _SSE_ADD(_SSE_MUL(y4, h1), _SSE_MUL(x4, h2));
 y5 = _SSE_ADD(_SSE_MUL(y5, h1), _SSE_MUL(x5, h2));
 y6 = _SSE_ADD(_SSE_MUL(y6, h1), _SSE_MUL(x6, h2));
-q1 = _SSE_LOAD(0, q);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[0]);
 q1 = _SSE_ADD(q1, y1);
-_SSE_STORE(q1, 0, q);
-q2 = _SSE_LOAD(0, &q[offset]);
+_SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[0]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[offset]);
 q2 = _SSE_ADD(q2, y2);
-_SSE_STORE(q2, 0, &q[offset]);
-q3 = _SSE_LOAD(0, &q[2*offset]);
+_SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *) &q[offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[2*offset]);
 q3 = _SSE_ADD(q3, y3);
-_SSE_STORE(q3, 0, &q[2*offset]);
-q4 = _SSE_LOAD(0, &q[3*offset]);
+_SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *) &q[2*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[3*offset]);
 q4 = _SSE_ADD(q4, y4);
-_SSE_STORE(q4, 0, &q[3*offset]);
-q5 = _SSE_LOAD(0, &q[4*offset]);
+_SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *) &q[3*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[4*offset]);
 q5 = _SSE_ADD(q5, y5);
-_SSE_STORE(q5, 0, &q[4*offset]);
-q6 = _SSE_LOAD(0, &q[5*offset]);
+_SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *) &q[4*offset]);
+q6 = _SSE_LOAD(0, (unsigned long int *) &q[5*offset]);
 q6 = _SSE_ADD(q6, y6);
-_SSE_STORE(q6, 0, &q[5*offset]);
+_SSE_STORE((__vector unsigned int) q6, 0, (unsigned int *) &q[5*offset]);
 #ifdef HAVE_SSE_INTRINSICS
 #ifdef DOUBLE_PRECISION_REAL
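
The store side mirrors the load side: every _SSE_STORE(qN, 0, &q[...]) becomes a store through an integer view, with the vector reinterpreted as __vector unsigned int and the destination cast to unsigned int *. A plausible reading (assuming _SSE_STORE maps to AltiVec's vec_st, which like vec_ld dispatches on the pointer type): the integer cast selects an overload that exists on all targeted element widths, while the bit pattern of the floating-point data is written out unchanged. A minimal sketch; store_q is a hypothetical helper:

    /* Sketch only: assumes GCC altivec.h with -mvsx and _SSE_STORE == vec_st. */
    #include <altivec.h>

    static void store_q(__vector double v, double *q, int idx)
    {
        /* mirrors _SSE_STORE((__vector unsigned int) v, 0, (unsigned int *) &q[idx]);
         * the casts reinterpret bits only, no value conversion happens */
        vec_st((__vector unsigned int) v, 0, (unsigned int *) &q[idx]);
    }
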
@@ -508,24 +517,24 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 #endif
 #endif
-q1 = _SSE_LOAD(0, &q[ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[ldq]);
 q1 = _SSE_ADD(q1, _SSE_ADD(x1, _SSE_MUL(y1, h2)));
-_SSE_STORE(q1, 0, &q[ldq]);
-q2 = _SSE_LOAD(0, &q[ldq+offset]);
+_SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[ldq]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+offset]);
 q2 = _SSE_ADD(q2, _SSE_ADD(x2, _SSE_MUL(y2, h2)));
-_SSE_STORE(q2, 0, &q[ldq+offset]);
-q3 = _SSE_LOAD(0, &q[ldq+2*offset]);
+_SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *) &q[ldq+offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+2*offset]);
 q3 = _SSE_ADD(q3, _SSE_ADD(x3, _SSE_MUL(y3, h2)));
-_SSE_STORE(q3, 0, &q[ldq+2*offset]);
-q4 = _SSE_LOAD(0, &q[ldq+3*offset]);
+_SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *) &q[ldq+2*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+3*offset]);
 q4 = _SSE_ADD(q4, _SSE_ADD(x4, _SSE_MUL(y4, h2)));
-_SSE_STORE(q4, 0, &q[ldq+3*offset]);
-q5 = _SSE_LOAD(0, &q[ldq+4*offset]);
+_SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *) &q[ldq+3*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+4*offset]);
 q5 = _SSE_ADD(q5, _SSE_ADD(x5, _SSE_MUL(y5, h2)));
-_SSE_STORE(q5, 0, &q[ldq+4*offset]);
-q6 = _SSE_LOAD(0, &q[ldq+5*offset]);
+_SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *) &q[ldq+4*offset]);
+q6 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+5*offset]);
 q6 = _SSE_ADD(q6, _SSE_ADD(x6, _SSE_MUL(y6, h2)));
-_SSE_STORE(q6, 0, &q[ldq+5*offset]);
+_SSE_STORE((__vector unsigned int) q6, 0, (unsigned int *) &q[ldq+5*offset]);
 for (i = 2; i < nb; i++)
 {
@@ -550,24 +559,24 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 #endif
 #endif
-q1 = _SSE_LOAD(0, &q[i*ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[i*ldq]);
 q1 = _SSE_ADD(q1, _SSE_ADD(_SSE_MUL(x1, h1), _SSE_MUL(y1, h2)));
-_SSE_STORE(q1, &q[i*ldq]);
-q2 = _SSE_LOAD(0, &q[(i*ldq)+offset]);
+_SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[i*ldq]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+offset]);
 q2 = _SSE_ADD(q2, _SSE_ADD(_SSE_MUL(x2, h1), _SSE_MUL(y2, h2)));
-_SSE_STORE(q2, 0, &q[(i*ldq)+offset]);
-q3 = _SSE_LOAD(0, &q[(i*ldq)+2*offset]);
+_SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *) &q[(i*ldq)+offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+2*offset]);
 q3 = _SSE_ADD(q3, _SSE_ADD(_SSE_MUL(x3, h1), _SSE_MUL(y3, h2)));
-_SSE_STORE(q3, 0, &q[(i*ldq)+2*offset]);
-q4 = _SSE_LOAD(0, &q[(i*ldq)+3*offset]);
+_SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *) &q[(i*ldq)+2*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+3*offset]);
 q4 = _SSE_ADD(q4, _SSE_ADD(_SSE_MUL(x4, h1), _SSE_MUL(y4, h2)));
-_SSE_STORE(q4, 0, &q[(i*ldq)+3*offset]);
-q5 = _SSE_LOAD(0, &q[(i*ldq)+4*offset]);
+_SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *) &q[(i*ldq)+3*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+4*offset]);
 q5 = _SSE_ADD(q5, _SSE_ADD(_SSE_MUL(x5, h1), _SSE_MUL(y5, h2)));
-_SSE_STORE(q5, 0, &q[(i*ldq)+4*offset]);
-q6 = _SSE_LOAD(0, &q[(i*ldq)+5*offset]);
+_SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *) &q[(i*ldq)+4*offset]);
+q6 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+5*offset]);
 q6 = _SSE_ADD(q6, _SSE_ADD(_SSE_MUL(x6, h1), _SSE_MUL(y6, h2)));
-_SSE_STORE(q6, 0, &q[(i*ldq)+5*offset]);
+_SSE_STORE((__vector unsigned int) q6, 0, (unsigned int *) &q[(i*ldq)+5*offset]);
 }
@@ -587,24 +596,24 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 #endif
-q1 = _SSE_LOAD(0, &q[nb*ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[nb*ldq]);
 q1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
-_SSE_STORE(q1, 0, &q[nb*ldq]);
-q2 = _SSE_LOAD(0, &q[(nb*ldq)+offset]);
+_SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[nb*ldq]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+offset]);
 q2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
-_SSE_STORE(q2, 0, &q[(nb*ldq)+offset]);
-q3 = _SSE_LOAD(0, &q[(nb*ldq)+2*offset]);
+_SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *) &q[(nb*ldq)+offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+2*offset]);
 q3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
-_SSE_STORE(q3, 0, &q[(nb*ldq)+2*offset]);
-q4 = _SSE_LOAD(0, &q[(nb*ldq)+3*offset]);
+_SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *) &q[(nb*ldq)+2*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+3*offset]);
 q4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
-_SSE_STORE(q4, 0, &q[(nb*ldq)+3*offset]);
-q5 = _SSE_LOAD(0, &q[(nb*ldq)+4*offset]);
+_SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *) &q[(nb*ldq)+3*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+4*offset]);
 q5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
-_SSE_STORE(q5, 0, &q[(nb*ldq)+4*offset]);
-q6 = _SSE_LOAD(0, &q[(nb*ldq)+5*offset]);
+_SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *) &q[(nb*ldq)+4*offset]);
+q6 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+5*offset]);
 q6 = _SSE_ADD(q6, _SSE_MUL(x6, h1));
-_SSE_STORE(q6, 0, &q[(nb*ldq)+5*offset]);
+_SSE_STORE((__vector unsigned int) q6, 0, (unsigned int *) &q[(nb*ldq)+5*offset]);
 }
@@ -632,6 +641,13 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 // hh contains two householder vectors, with offset 1
 /////////////////////////////////////////////////////
 int i;
+#ifdef DOUBLE_PRECISION_REAL
+double mone = -1.0;
+#endif
+#ifdef SINGLE_PRECISION_REAL
+float mone = -1.0;
+#endif
 #ifdef HAVE_SSE_INTRINSICS
 // Needed bit mask for floating point sign flip
 #ifdef DOUBLE_PRECISION_REAL
@@ -642,11 +658,11 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 #endif
 #endif
-__SSE_DATATYPE x1 = _SSE_LOAD(0, &q[ldq]);
-__SSE_DATATYPE x2 = _SSE_LOAD(0, &q[ldq+offset]);
-__SSE_DATATYPE x3 = _SSE_LOAD(0, &q[ldq+2*offset]);
-__SSE_DATATYPE x4 = _SSE_LOAD(0, &q[ldq+3*offset]);
-__SSE_DATATYPE x5 = _SSE_LOAD(0, &q[ldq+4*offset]);
+__SSE_DATATYPE x1 = _SSE_LOAD(0, (unsigned long int *) &q[ldq]);
+__SSE_DATATYPE x2 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+offset]);
+__SSE_DATATYPE x3 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+2*offset]);
+__SSE_DATATYPE x4 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+3*offset]);
+__SSE_DATATYPE x5 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+4*offset]);
 #ifdef HAVE_SSE_INTRINSICS
 #ifdef DOUBLE_PRECISION_REAL
@@ -667,15 +683,15 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 __SSE_DATATYPE h2;
-__SSE_DATATYPE q1 = _SSE_LOAD(0, q);
+__SSE_DATATYPE q1 = _SSE_LOAD(0, (unsigned long int *) &q[0]);
 __SSE_DATATYPE y1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
-__SSE_DATATYPE q2 = _SSE_LOAD(0, &q[offset]);
+__SSE_DATATYPE q2 = _SSE_LOAD(0, (unsigned long int *) &q[offset]);
 __SSE_DATATYPE y2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
-__SSE_DATATYPE q3 = _SSE_LOAD(0, &q[2*offset]);
+__SSE_DATATYPE q3 = _SSE_LOAD(0, (unsigned long int *) &q[2*offset]);
 __SSE_DATATYPE y3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
-__SSE_DATATYPE q4 = _SSE_LOAD(0, &q[3*offset]);
+__SSE_DATATYPE q4 = _SSE_LOAD(0, (unsigned long int *) &q[3*offset]);
 __SSE_DATATYPE y4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
-__SSE_DATATYPE q5 = _SSE_LOAD(0, &q[4*offset]);
+__SSE_DATATYPE q5 = _SSE_LOAD(0, (unsigned long int *) &q[4*offset]);
 __SSE_DATATYPE y5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
 for (i = 2; i < nb; i++)
 {
@@ -701,19 +717,19 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 #endif
-q1 = _SSE_LOAD(0, &q[i*ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[i*ldq]);
 x1 = _SSE_ADD(x1, _SSE_MUL(q1, h1));
 y1 = _SSE_ADD(y1, _SSE_MUL(q1, h2));
-q2 = _SSE_LOAD(0, &q[(i*ldq)+offset]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+offset]);
 x2 = _SSE_ADD(x2, _SSE_MUL(q2, h1));
 y2 = _SSE_ADD(y2, _SSE_MUL(q2, h2));
-q3 = _SSE_LOAD(0, &q[(i*ldq)+2*offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+2*offset]);
 x3 = _SSE_ADD(x3, _SSE_MUL(q3, h1));
 y3 = _SSE_ADD(y3, _SSE_MUL(q3, h2));
-q4 = _SSE_LOAD(0, &q[(i*ldq)+3*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+3*offset]);
 x4 = _SSE_ADD(x4, _SSE_MUL(q4, h1));
 y4 = _SSE_ADD(y4, _SSE_MUL(q4, h2));
-q5 = _SSE_LOAD(0, &q[(i*ldq)+4*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[(i*ldq)+4*offset]);
 x5 = _SSE_ADD(x5, _SSE_MUL(q5, h1));
 y5 = _SSE_ADD(y5, _SSE_MUL(q5, h2));
 }
@@ -731,19 +747,19 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 h1 = vec_splats(hh[nb-1]);
 #endif
 #ifdef SINGLE_PRECISION_REAL
-h1 = vex_splats(hh[nb-1]);
+h1 = vec_splats(hh[nb-1]);
 #endif
 #endif
-q1 = _SSE_LOAD(0, &q[nb*ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[nb*ldq]);
 x1 = _SSE_ADD(x1, _SSE_MUL(q1, h1));
-q2 = _SSE_LOAD(0, &q[(nb*ldq)+offset]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+offset]);
 x2 = _SSE_ADD(x2, _SSE_MUL(q2, h1));
-q3 = _SSE_LOAD(0, &q[(nb*ldq)+2*offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+2*offset]);
 x3 = _SSE_ADD(x3, _SSE_MUL(q3, h1));
-q4 = _SSE_LOAD(0, &q[(nb*ldq)+3*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+3*offset]);
 x4 = _SSE_ADD(x4, _SSE_MUL(q4, h1));
-q5 = _SSE_LOAD(0, &q[(nb*ldq)+4*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[(nb*ldq)+4*offset]);
 x5 = _SSE_ADD(x5, _SSE_MUL(q5, h1));
 /////////////////////////////////////////////////////
 // Rank-2 update of Q [12 x nb+1]
@@ -768,7 +784,7 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 __SSE_DATATYPE vs = vec_splats(s);
 #endif
 #ifdef SINGLE_PRECISION_REAL
-__SSE_DATATYPE tau1 = vec_splats(hh[0], hh[0]);
+__SSE_DATATYPE tau1 = vec_splats(hh[0]);
 __SSE_DATATYPE tau2 = vec_splats(hh[ldh]);
 __SSE_DATATYPE vs = vec_splats(s);
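
The tau1 fix above is another intrinsic-signature correction: vec_splats takes a single scalar and replicates it across all lanes, so the two-argument call could not compile. For reference, a minimal correct use (splat_tau is an illustrative helper, not kernel code):

    /* Sketch only: assumes GCC altivec.h compiled with -mvsx. */
    #include <altivec.h>

    static __vector float splat_tau(const float *hh)
    {
        return vec_splats(hh[0]);   /* one argument, replicated into all four float lanes */
    }
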
@@ -779,7 +795,8 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 h1 = _SSE_XOR(tau1, sign);
 #endif
 #ifdef HAVE_VSX_SSE
-h1 = vec_neg(tau1);
+h1 = vec_mul(vec_splats(mone), tau1);
+// h1 = vec_neg(tau1);
 #endif
 x1 = _SSE_MUL(x1, h1);
 x2 = _SSE_MUL(x2, h1);
@@ -790,7 +807,8 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 h1 = _SSE_XOR(tau2, sign);
 #endif
 #ifdef HAVE_VSX_SSE
-h1 = vec_neg(tau2);
+// h1 = vec_neg(tau2);
+h1 = vec_mul(vec_splats(mone), tau2);
 #endif
 h2 = _SSE_MUL(h1, vs);
@@ -799,21 +817,21 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 y3 = _SSE_ADD(_SSE_MUL(y3, h1), _SSE_MUL(x3, h2));
 y4 = _SSE_ADD(_SSE_MUL(y4, h1), _SSE_MUL(x4, h2));
 y5 = _SSE_ADD(_SSE_MUL(y5, h1), _SSE_MUL(x5, h2));
-q1 = _SSE_LOAD(0, q);
+q1 = _SSE_LOAD(0, (unsigned int *) &q[0]);
 q1 = _SSE_ADD(q1, y1);
-_SSE_STORE(q1, 0, q);
-q2 = _SSE_LOAD(0, &q[offset]);
+_SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[0]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[offset]);
 q2 = _SSE_ADD(q2, y2);
-_SSE_STORE(q2, 0, &q[offset]);
-q3 = _SSE_LOAD(0, &q[2*offset]);
+_SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *) &q[offset]);
+q3 = _SSE_LOAD(0, (unsigned long int *) &q[2*offset]);
 q3 = _SSE_ADD(q3, y3);
-_SSE_STORE(q3, 0, &q[2*offset]);
-q4 = _SSE_LOAD(0, &q[3*offset]);
+_SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *) &q[2*offset]);
+q4 = _SSE_LOAD(0, (unsigned long int *) &q[3*offset]);
 q4 = _SSE_ADD(q4, y4);
-_SSE_STORE(q4, 0, &q[3*offset]);
-q5 = _SSE_LOAD(0, &q[4*offset]);
+_SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *) &q[3*offset]);
+q5 = _SSE_LOAD(0, (unsigned long int *) &q[4*offset]);
 q5 = _SSE_ADD(q5, y5);
-_SSE_STORE(q5, 0, &q[4*offset]);
+_SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *) &q[4*offset]);
 #ifdef HAVE_SSE_INTRINSICS
 #ifdef DOUBLE_PRECISION_REAL
@@ -832,21 +850,21 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
 #endif
 #endif
-q1 = _SSE_LOAD(0, &q[ldq]);
+q1 = _SSE_LOAD(0, (unsigned long int *) &q[ldq]);
 q1 = _SSE_ADD(q1, _SSE_ADD(x1, _SSE_MUL(y1, h2)));
-_SSE_STORE(q1, 0, &q[ldq]);
-q2 = _SSE_LOAD(0, &q[ldq+offset]);
+_SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[ldq]);
+q2 = _SSE_LOAD(0, (unsigned long int *) &q[ldq+offset]);