Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
c38695bd
Commit
c38695bd
authored
Nov 23, 2017
by
Andreas Marek
Browse files
Double precision real block6 kernel for Sparc64
parent
46ed16fd
Changes
9
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
Makefile.am
View file @
c38695bd
...
...
@@ -244,12 +244,12 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
#
if WITH_REAL_SPARC64_BLOCK4_KERNEL
#
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
if
WITH_REAL_SPARC64_BLOCK4_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_sparc64_4hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c
#endif
#
endif
endif
if
WITH_REAL_VSX_BLOCK4_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_vsx_4hv_double_precision.c
...
...
@@ -288,9 +288,9 @@ endif
if
WITH_REAL_SPARC64_BLOCK6_KERNEL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_sparc64_6hv_double_precision.c
if
WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/elpa2/kernels/real_sparc64_6hv_single_precision.c
endif
#
if WANT_SINGLE_PRECISION_REAL
#
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
#
endif
endif
if
WITH_REAL_VSX_BLOCK6_KERNEL
...
...
@@ -689,6 +689,9 @@ EXTRA_DIST = \
src/elpa2/kernels/real_vsx_2hv_template.c
\
src/elpa2/kernels/real_vsx_4hv_template.c
\
src/elpa2/kernels/real_vsx_6hv_template.c
\
src/elpa2/kernels/real_sparc64_2hv_template.c
\
src/elpa2/kernels/real_sparc64_4hv_template.c
\
src/elpa2/kernels/real_sparc64_6hv_template.c
\
src/elpa2/kernels/real_sse_2hv_template.c
\
src/elpa2/kernels/real_sse_4hv_template.c
\
src/elpa2/kernels/real_sse_6hv_template.c
\
...
...
configure.ac
View file @
c38695bd
...
...
@@ -470,8 +470,6 @@ m4_define(elpa_m4_sparc64_kernels, [
real_sparc64_block2
real_sparc64_block4
real_sparc64_block6
complex_sparc64_block1
complex_sparc64_block2
])
m4_define(elpa_m4_vsx_kernels, [
...
...
elpa/elpa_constants.h.in
View file @
c38695bd
...
...
@@ -72,9 +72,7 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1, 15, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2, 16, @ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
...
...
src/elpa2/compute_hh_trafo.F90
View file @
c38695bd
...
...
@@ -628,32 +628,32 @@
! sparc64 block1 complex kernel
#if defined(WITH_COMPLEX_SPARC64_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1
)
then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
ttt
=
mpi_wtime
()
do
j
=
ncols
,
1
,
-1
#ifdef WITH_OPENMP
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sparc64_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#else
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sparc64_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
j
+
off
),
nbw
,
nl
,
stripe_width
)
#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif
! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
#ifndef WITH_FIXED_COMPLEX_KERNEL
!
if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
!
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
!
ttt = mpi_wtime()
!
do j = ncols, 1, -1
!
#ifdef WITH_OPENMP
!
call single_hh_trafo_&
!
&MATH_DATATYPE&
!
&_sparc64_1hv_&
!
&PRECISION&
!
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!
#else
!
call single_hh_trafo_&
!
&MATH_DATATYPE&
!
&_sparc64_1hv_&
!
&PRECISION&
!
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!
#endif
!
enddo
!
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
!
!
#ifndef WITH_FIXED_COMPLEX_KERNEL
!
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
!
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SPARC64_BLOCK1_KERNEL */
#endif /* COMPLEXCASE */
...
...
@@ -918,45 +918,45 @@
! implementation of sparc64 block 2 complex case
#if defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2
)
then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt
=
mpi_wtime
()
do
j
=
ncols
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sparc64_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sparc64_2hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
)),
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#ifdef WITH_OPENMP
if
(
j
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sparc64_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
1
+
off
+
a_off
,
istripe
,
my_thread
)),
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#else
if
(
j
==
1
)
call
single_hh_trafo_
&
&
MATH_DATATYPE
&
&
_
sparc64_1hv_
&
&
PRECISION
&
&
(
c_loc
(
a
(
1
,
1
+
off
+
a_off
,
istripe
)),
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif
! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
#ifndef WITH_FIXED_COMPLEX_KERNEL
!
if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) then
!
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!
ttt = mpi_wtime()
!
do j = ncols, 2, -2
!
w(:,1) = bcast_buffer(1:nbw,j+off)
!
w(:,2) = bcast_buffer(1:nbw,j+off-1)
!
#ifdef WITH_OPENMP
!
call double_hh_trafo_&
!
&MATH_DATATYPE&
!
&_sparc64_2hv_&
!
&PRECISION&
!
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
!
#else
!
call double_hh_trafo_&
!
&MATH_DATATYPE&
!
&_sparc64_2hv_&
!
&PRECISION&
!
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
!
#endif
!
enddo
!
#ifdef WITH_OPENMP
!
if (j==1) call single_hh_trafo_&
!
&MATH_DATATYPE&
!
&_sparc64_1hv_&
!
&PRECISION&
!
& (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!
#else
!
if (j==1) call single_hh_trafo_&
!
&MATH_DATATYPE&
!
&_sparc64_1hv_&
!
&PRECISION&
!
& (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!
#endif
!
!
#ifndef WITH_FIXED_COMPLEX_KERNEL
!
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2)
!
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */
...
...
src/elpa2/elpa2_template.F90
View file @
c38695bd
...
...
@@ -192,6 +192,14 @@
write
(
error_unit
,
*
)
"The GENERIC kernel will be used at the moment"
kernel
=
ELPA_2STAGE_REAL_GENERIC
endif
! special case at the moment NO single precision kernels on SPARC64 -> set GENERIC for now
if
(
kernel
.eq.
ELPA_2STAGE_REAL_SPARC64_BLOCK2
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_SPARC64_BLOCK4
.or.
&
kernel
.eq.
ELPA_2STAGE_REAL_SPARC64_BLOCK6
)
then
write
(
error_unit
,
*
)
"ELPA: At the moment there exist no specific SINGLE precision kernels for SPARC64"
write
(
error_unit
,
*
)
"The GENERIC kernel will be used at the moment"
kernel
=
ELPA_2STAGE_REAL_GENERIC
endif
#endif
#endif
...
...
src/elpa2/kernels/real_sse_2hv_template.c
View file @
c38695bd
...
...
@@ -44,6 +44,7 @@
//
// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
//
#include
"config-f90.h"
#ifdef HAVE_SSE_INTRINSICS
#include
<x86intrin.h>
...
...
@@ -82,6 +83,7 @@
#undef __AVX__
#endif
#ifdef HAVE_SSE_INTRINSICS
//Forward declaration
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_2_SSE_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
);
...
...
@@ -99,14 +101,46 @@ __forceinline void hh_trafo_kernel_16_SSE_2hv_single(float* q, float* hh, int nb
__forceinline
void
hh_trafo_kernel_20_SSE_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
);
__forceinline
void
hh_trafo_kernel_24_SSE_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
//Forward declaration
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_2_SPARC64_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
);
__forceinline
void
hh_trafo_kernel_4_SPARC64_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
);
__forceinline
void
hh_trafo_kernel_6_SPARC64_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
);
__forceinline
void
hh_trafo_kernel_8_SPARC64_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
);
__forceinline
void
hh_trafo_kernel_10_SPARC64_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
);
__forceinline
void
hh_trafo_kernel_12_SPARC64_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
);
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_4_SPARC64_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
);
__forceinline
void
hh_trafo_kernel_8_SPARC64_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
);
__forceinline
void
hh_trafo_kernel_12_SPARC64_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
);
__forceinline
void
hh_trafo_kernel_16_SPARC64_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
);
__forceinline
void
hh_trafo_kernel_20_SPARC64_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
);
__forceinline
void
hh_trafo_kernel_24_SPARC64_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
);
#endif
#endif
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
void
double_hh_trafo_real_sse_2hv_double
(
double
*
q
,
double
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
);
#endif
#ifdef SINGLE_PRECISION_REAL
void
double_hh_trafo_real_sse_2hv_single_
(
float
*
q
,
float
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
void
double_hh_trafo_real_sparc64_2hv_double
(
double
*
q
,
double
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
);
#endif
#ifdef SINGLE_PRECISION_REAL
void
double_hh_trafo_real_sparc64_2hv_single_
(
float
*
q
,
float
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
);
#endif
#endif
/*
!f>#ifdef HAVE_SPARC64_SSE
...
...
@@ -210,14 +244,24 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL
for
(
i
=
0
;
i
<
nq
-
10
;
i
+=
12
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_12_SSE_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_12_SPARC64_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
worked_on
+=
12
;
}
#endif
#ifdef SINGLE_PRECISION_REAL
for
(
i
=
0
;
i
<
nq
-
20
;
i
+=
24
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_24_SSE_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_24_SPARC64_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
worked_on
+=
24
;
}
#endif
...
...
@@ -230,7 +274,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL
if
(
nq
-
i
==
10
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_10_SSE_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_10_SPARC64_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
worked_on
+=
10
;
}
#endif
...
...
@@ -238,7 +287,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL
if
(
nq
-
i
==
20
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_20_SSE_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_20_SPARC64_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
worked_on
+=
20
;
}
#endif
...
...
@@ -246,7 +300,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL
if
(
nq
-
i
==
8
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_8_SSE_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_8_SPARC64_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
worked_on
+=
8
;
}
#endif
...
...
@@ -254,7 +313,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL
if
(
nq
-
i
==
16
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_16_SSE_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_16_SPARC64_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
worked_on
+=
16
;
}
#endif
...
...
@@ -263,7 +327,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL
if
(
nq
-
i
==
6
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_6_SSE_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_6_SPARC64_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
worked_on
+=
6
;
}
#endif
...
...
@@ -271,7 +340,13 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL
if
(
nq
-
i
==
12
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_12_SSE_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_12_SPARC64_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
worked_on
+=
12
;
}
#endif
...
...
@@ -279,7 +354,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL
if
(
nq
-
i
==
4
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_4_SSE_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_4_SPARC64_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
worked_on
+=
4
;
}
#endif
...
...
@@ -287,7 +367,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL
if
(
nq
-
i
==
8
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_8_SSE_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_8_SPARC64_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
worked_on
+=
8
;
}
#endif
...
...
@@ -295,7 +380,12 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef DOUBLE_PRECISION_REAL
if
(
nq
-
i
==
2
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_2_SSE_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_2_SPARC64_2hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
worked_on
+=
2
;
}
#endif
...
...
@@ -303,14 +393,26 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
#ifdef SINGLE_PRECISION_REAL
if
(
nq
-
i
==
4
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_4_SSE_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_4_SPARC64_2hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s
);
#endif
worked_on
+=
4
;
}
#endif
#ifdef WITH_DEBUG
if
(
worked_on
!=
nq
)
{
#ifdef HAVE_SSE_INTRINSICS
printf
(
"Error in real SSE BLOCK2 kernel %d %d
\n
"
,
worked_on
,
nq
);
#endif
#ifdef HAVE_SPARC64_SSE
printf
(
"Error in real SPARC64 BLOCK2 kernel %d %d
\n
"
,
worked_on
,
nq
);
#endif
abort
();
}
#endif
...
...
@@ -327,12 +429,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_12_SSE_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_24_SSE_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_12_SPARC64_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_24_SPARC64_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
#endif
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [12 x nb+1] * hh
...
...
@@ -661,11 +773,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_10_SSE_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_20_SSE_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_10_SPARC64_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_20_SPARC64_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
#endif
#endif
{
/////////////////////////////////////////////////////
...
...
@@ -973,11 +1096,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_8_SSE_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_16_SSE_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_8_SPARC64_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_16_SPARC64_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
#endif
#endif
{
/////////////////////////////////////////////////////
...
...
@@ -1262,11 +1396,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_6_SSE_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_12_SSE_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_6_SPARC64_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_12_SPARC64_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
#endif
#endif
{
/////////////////////////////////////////////////////
...
...
@@ -1531,11 +1676,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_4_SSE_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_8_SSE_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_4_SPARC64_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_8_SPARC64_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
#endif
#endif
{
/////////////////////////////////////////////////////
...
...
@@ -1775,11 +1931,22 @@ void double_hh_trafo_real_sparc64_2hv_single(float* q, float* hh, int* pnb, int*
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_2_SSE_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_4_SSE_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_2_SPARC64_2hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_4_SPARC64_2hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s
)
#endif
#endif
{
/////////////////////////////////////////////////////
...
...
src/elpa2/kernels/real_sse_4hv_template.c
View file @
c38695bd
...
...
@@ -759,7 +759,7 @@ __forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, in
h1
=
tau1
;
x1
=
_SSE_MUL
(
x1
,
h1
);
x2
=
_SSE_MUL
(
x2
,
h1
);
x3
=
_SSE_MUL
(
x3
,
h1
)
x3
=
_SSE_MUL
(
x3
,
h1
)
;
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -944,7 +944,7 @@ __forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, in
#endif
#endif
#ifdef HAVE_SPARC64_
INTRINSICS
#ifdef HAVE_SPARC64_
SSE
#ifdef DOUBLE_PRECISION_REAL
h3
=
_mm_set_pd
(
hh
[(
ldh
*
2
)
+
1
],
hh
[(
ldh
*
2
)
+
1
]);
#endif
...
...
@@ -1164,7 +1164,7 @@ __forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, in
q2
=
_SSE_SUB
(
q2
,
_SSE_MUL
(
x2
,
h1
));
q3
=
_SSE_SUB
(
q3
,
_SSE_MUL
(
x3
,
h1
));
#ifdef HAVE_SSE_INTRINSCS
#ifdef HAVE_SSE_INTRINS
I
CS
#ifdef DOUBLE_PRECISION_REAL
h2
=
_mm_set1_pd
(
hh
[
ldh
+
nb
-
2
]);
#endif
...
...
@@ -2248,7 +2248,7 @@ __forceinline void hh_trafo_kernel_4_SPARC64_4hv_single(float* q, float* hh, int
#endif