Commit e54b7632 authored Jun 06, 2017 by Andreas Marek
Start to unify double/single real sse block2 kernel
parent ebb92453
Showing 6 changed files with 881 additions and 976 deletions
Makefile.am                                                        +1   -0
generated_headers.am                                               +1   -1
src/elpa2/kernels/elpa2_kernels_real_sse_2hv_double_precision.c    +7   -525
src/elpa2/kernels/elpa2_kernels_real_sse_2hv_single_precision.c    +8   -449
src/elpa2/kernels/elpa2_kernels_real_sse_2hv_template.Xc           +861 -0
src/general/precision_macros.h                                     +3   -1
Makefile.am
...
@@ -589,6 +589,7 @@ EXTRA_DIST = \
   src/elpa2/kernels/elpa2_kernels_real_template.X90 \
   src/elpa2/kernels/elpa2_kernels_complex_template.X90 \
   src/elpa2/kernels/elpa2_kernels_simple_template.X90 \
+  src/elpa2/kernels/elpa2_kernels_real_sse_2hv_template.Xc \
   src/elpa2/redist_band.X90 \
   src/elpa2/pack_unpack_cpu.X90 \
   src/elpa2/pack_unpack_gpu.X90 \
...
generated_headers.am
...
@@ -28,7 +28,7 @@ test/shared/generated.h: $(wildcard $(top_srcdir)/test/shared/*.*90) | test/shar
 	@rm -f $@
 	$(call extract_interface,!c>)
-elpa/elpa_generated_fortran_interfaces.h: $(wildcard $(top_srcdir)/src/elpa2/kernels/*.c $(top_srcdir)/src/elpa2/kernels/*.s $(top_srcdir)/src/*.[ch]) | elpa
+elpa/elpa_generated_fortran_interfaces.h: $(wildcard $(top_srcdir)/src/elpa2/kernels/*.*c $(top_srcdir)/src/elpa2/kernels/*.s $(top_srcdir)/src/*.[ch]) | elpa
 	@rm -f $@
 	$(call extract_interface,!f>)
 	$(call extract_interface,#!f>)
...
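The functional change in this rule is the wildcard: *.c becomes *.*c, so the interface extraction now also scans the new .Xc kernel template. That matters because the !f> Fortran interface blocks, carried inside ordinary C comments, move into the template along with the kernels. An abbreviated sketch of such a block (the full version appears verbatim in the double-precision file below):

/* Abbreviated sketch: extract_interface copies the "!f>" lines out of C (and
   now .Xc) sources into elpa/elpa_generated_fortran_interfaces.h. */
/*
!f> interface
!f>   subroutine double_hh_trafo_real_sse_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>              bind(C, name="double_hh_trafo_real_sse_2hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     ...
!f>   end subroutine
!f> end interface
*/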
src/elpa2/kernels/elpa2_kernels_real_sse_2hv_double_precision.c
...
@@ -42,532 +42,14 @@
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
// Author: Andreas Marek, MPCDF
#include "config-f90.h"
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
//Forward declaration
__forceinline void hh_trafo_kernel_4_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_12_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);

void double_hh_trafo_real_sse_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine double_hh_trafo_real_sse_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_real_sse_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
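The block above is what the extract_interface rule harvests into elpa/elpa_generated_fortran_interfaces.h. Because the integer dummy arguments lack the value attribute, Fortran passes them by reference, which is why the C entry point takes int* parameters. A minimal, hypothetical C-side caller, for illustration only (all sizes invented; q and hh must be 16-byte aligned since the kernels use _mm_load_pd/_mm_store_pd):

#include <stdlib.h>

/* Hypothetical illustration only: nb, nq, ldq, ldh are invented sizes. */
extern void double_hh_trafo_real_sse_2hv_double(double* q, double* hh,
                                                int* pnb, int* pnq, int* pldq, int* pldh);

int main(void)
{
	int nb = 32, nq = 24, ldq = 24, ldh = 32;
	double *q  = aligned_alloc(16, (size_t)(nb + 1) * ldq * sizeof(double));
	double *hh = aligned_alloc(16, (size_t)2 * ldh * sizeof(double));
	for (size_t k = 0; k < (size_t)(nb + 1) * ldq; k++) q[k]  = 0.0;
	for (size_t k = 0; k < (size_t)2 * ldh;        k++) hh[k] = 0.0;

	/* integer arguments passed by address, matching the Fortran interface */
	double_hh_trafo_real_sse_2hv_double(q, hh, &nb, &nq, &ldq, &ldh);

	free(q);
	free(hh);
	return 0;
}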
void double_hh_trafo_real_sse_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	int ldh = *pldh;

	// calculating scalar product to compute
	// 2 householder vectors simultaneously
	double s = hh[(ldh)+1]*1.0;

	#pragma ivdep
	for (i = 2; i < nb; i++)
	{
		s += hh[i-1] * hh[(i+ldh)];
	}

	// Production level kernel calls with padding
	for (i = 0; i < nq-8; i+=12)
	{
		hh_trafo_kernel_12_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
	}
	if (nq == i)
	{
		return;
	}
	else
	{
		if (nq-i > 4)
		{
			hh_trafo_kernel_8_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
		}
		else if (nq-i > 0)
		{
			hh_trafo_kernel_4_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
		}
	}
}
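For orientation before the unrolled kernels below: each kernel applies the same per-column update, just to 12, 8, or 4 columns at once with SSE. The following scalar sketch is a reading aid distilled from the intrinsics in this diff, not code from the commit. hh holds the two Householder vectors (the second offset by ldh), s is the scalar product computed above, and hh[0], hh[ldh] play the role of tau1 and tau2.

/* Scalar reading aid (not part of the commit): applies the two combined
 * Householder reflectors to column j of q, mirroring the SIMD kernels. */
static void hh_trafo_scalar_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s, int j)
{
	int i;
	/* x = q(:,j) . hh1, y = q(:,j) . hh2; hh2 is shifted down by one row */
	double x = q[ldq + j];
	double y = q[j] + x * hh[ldh + 1];
	for (i = 2; i < nb; i++)
	{
		x += q[i*ldq + j] * hh[i-1];
		y += q[i*ldq + j] * hh[ldh + i];
	}
	x += q[nb*ldq + j] * hh[nb-1];

	/* rank-2 update; the sign flips correspond to _mm_xor_pd with the sign mask */
	x *= -hh[0];
	y  = y * (-hh[ldh]) + x * (-hh[ldh]) * s;

	q[j]       += y;
	q[ldq + j] += x + y * hh[ldh + 1];
	for (i = 2; i < nb; i++)
	{
		q[i*ldq + j] += x * hh[i-1] + y * hh[ldh + i];
	}
	q[nb*ldq + j] += x * hh[nb-1];
}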
/**
* Unrolled kernel that computes
* 12 rows of Q simultaneously, a
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_12_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [12 x nb+1] * hh
	// hh contains two householder vectors, with offset 1
	/////////////////////////////////////////////////////
	int i;
	// Needed bit mask for floating point sign flip
	__m128d sign = (__m128d)_mm_set1_epi64x(0x8000000000000000LL);

	__m128d x1 = _mm_load_pd(&q[ldq]);
	__m128d x2 = _mm_load_pd(&q[ldq+2]);
	__m128d x3 = _mm_load_pd(&q[ldq+4]);
	__m128d x4 = _mm_load_pd(&q[ldq+6]);
	__m128d x5 = _mm_load_pd(&q[ldq+8]);
	__m128d x6 = _mm_load_pd(&q[ldq+10]);

	__m128d h1 = _mm_loaddup_pd(&hh[ldh+1]);
	__m128d h2;

	__m128d q1 = _mm_load_pd(q);
	__m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1));
	__m128d q2 = _mm_load_pd(&q[2]);
	__m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1));
	__m128d q3 = _mm_load_pd(&q[4]);
	__m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1));
	__m128d q4 = _mm_load_pd(&q[6]);
	__m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1));
	__m128d q5 = _mm_load_pd(&q[8]);
	__m128d y5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1));
	__m128d q6 = _mm_load_pd(&q[10]);
	__m128d y6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1));

	for (i = 2; i < nb; i++)
	{
		h1 = _mm_loaddup_pd(&hh[i-1]);
		h2 = _mm_loaddup_pd(&hh[ldh+i]);

		q1 = _mm_load_pd(&q[i*ldq]);
		x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
		y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
		q2 = _mm_load_pd(&q[(i*ldq)+2]);
		x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1));
		y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2));
		q3 = _mm_load_pd(&q[(i*ldq)+4]);
		x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1));
		y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2));
		q4 = _mm_load_pd(&q[(i*ldq)+6]);
		x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1));
		y4 = _mm_add_pd(y4, _mm_mul_pd(q4,h2));
		q5 = _mm_load_pd(&q[(i*ldq)+8]);
		x5 = _mm_add_pd(x5, _mm_mul_pd(q5,h1));
		y5 = _mm_add_pd(y5, _mm_mul_pd(q5,h2));
		q6 = _mm_load_pd(&q[(i*ldq)+10]);
		x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1));
		y6 = _mm_add_pd(y6, _mm_mul_pd(q6,h2));
	}

	h1 = _mm_loaddup_pd(&hh[nb-1]);

	q1 = _mm_load_pd(&q[nb*ldq]);
	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
	x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1));
	q3 = _mm_load_pd(&q[(nb*ldq)+4]);
	x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1));
	q4 = _mm_load_pd(&q[(nb*ldq)+6]);
	x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1));
	q5 = _mm_load_pd(&q[(nb*ldq)+8]);
	x5 = _mm_add_pd(x5, _mm_mul_pd(q5,h1));
	q6 = _mm_load_pd(&q[(nb*ldq)+10]);
	x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1));

	/////////////////////////////////////////////////////
	// Rank-2 update of Q [12 x nb+1]
	/////////////////////////////////////////////////////
	__m128d tau1 = _mm_loaddup_pd(hh);
	__m128d tau2 = _mm_loaddup_pd(&hh[ldh]);
	__m128d vs = _mm_loaddup_pd(&s);

	h1 = _mm_xor_pd(tau1, sign);
	x1 = _mm_mul_pd(x1, h1);
	x2 = _mm_mul_pd(x2, h1);
	x3 = _mm_mul_pd(x3, h1);
	x4 = _mm_mul_pd(x4, h1);
	x5 = _mm_mul_pd(x5, h1);
	x6 = _mm_mul_pd(x6, h1);

	h1 = _mm_xor_pd(tau2, sign);
	h2 = _mm_mul_pd(h1, vs);

	y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2));
	y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2));
	y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2));
	y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2));
	y5 = _mm_add_pd(_mm_mul_pd(y5,h1), _mm_mul_pd(x5,h2));
	y6 = _mm_add_pd(_mm_mul_pd(y6,h1), _mm_mul_pd(x6,h2));

	q1 = _mm_load_pd(q);
	q1 = _mm_add_pd(q1, y1);
	_mm_store_pd(q, q1);
	q2 = _mm_load_pd(&q[2]);
	q2 = _mm_add_pd(q2, y2);
	_mm_store_pd(&q[2], q2);
	q3 = _mm_load_pd(&q[4]);
	q3 = _mm_add_pd(q3, y3);
	_mm_store_pd(&q[4], q3);
	q4 = _mm_load_pd(&q[6]);
	q4 = _mm_add_pd(q4, y4);
	_mm_store_pd(&q[6], q4);
	q5 = _mm_load_pd(&q[8]);
	q5 = _mm_add_pd(q5, y5);
	_mm_store_pd(&q[8], q5);
	q6 = _mm_load_pd(&q[10]);
	q6 = _mm_add_pd(q6, y6);
	_mm_store_pd(&q[10], q6);

	h2 = _mm_loaddup_pd(&hh[ldh+1]);

	q1 = _mm_load_pd(&q[ldq]);
	q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2)));
	_mm_store_pd(&q[ldq], q1);
	q2 = _mm_load_pd(&q[ldq+2]);
	q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2)));
	_mm_store_pd(&q[ldq+2], q2);
	q3 = _mm_load_pd(&q[ldq+4]);
	q3 = _mm_add_pd(q3, _mm_add_pd(x3, _mm_mul_pd(y3, h2)));
	_mm_store_pd(&q[ldq+4], q3);
	q4 = _mm_load_pd(&q[ldq+6]);
	q4 = _mm_add_pd(q4, _mm_add_pd(x4, _mm_mul_pd(y4, h2)));
	_mm_store_pd(&q[ldq+6], q4);
	q5 = _mm_load_pd(&q[ldq+8]);
	q5 = _mm_add_pd(q5, _mm_add_pd(x5, _mm_mul_pd(y5, h2)));
	_mm_store_pd(&q[ldq+8], q5);
	q6 = _mm_load_pd(&q[ldq+10]);
	q6 = _mm_add_pd(q6, _mm_add_pd(x6, _mm_mul_pd(y6, h2)));
	_mm_store_pd(&q[ldq+10], q6);

	for (i = 2; i < nb; i++)
	{
		h1 = _mm_loaddup_pd(&hh[i-1]);
		h2 = _mm_loaddup_pd(&hh[ldh+i]);

		q1 = _mm_load_pd(&q[i*ldq]);
		q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)));
		_mm_store_pd(&q[i*ldq], q1);
		q2 = _mm_load_pd(&q[(i*ldq)+2]);
		q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2)));
		_mm_store_pd(&q[(i*ldq)+2], q2);
		q3 = _mm_load_pd(&q[(i*ldq)+4]);
		q3 = _mm_add_pd(q3, _mm_add_pd(_mm_mul_pd(x3,h1), _mm_mul_pd(y3, h2)));
		_mm_store_pd(&q[(i*ldq)+4], q3);
		q4 = _mm_load_pd(&q[(i*ldq)+6]);
		q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4,h1), _mm_mul_pd(y4, h2)));
		_mm_store_pd(&q[(i*ldq)+6], q4);
		q5 = _mm_load_pd(&q[(i*ldq)+8]);
		q5 = _mm_add_pd(q5, _mm_add_pd(_mm_mul_pd(x5,h1), _mm_mul_pd(y5, h2)));
		_mm_store_pd(&q[(i*ldq)+8], q5);
		q6 = _mm_load_pd(&q[(i*ldq)+10]);
		q6 = _mm_add_pd(q6, _mm_add_pd(_mm_mul_pd(x6,h1), _mm_mul_pd(y6, h2)));
		_mm_store_pd(&q[(i*ldq)+10], q6);
	}

	h1 = _mm_loaddup_pd(&hh[nb-1]);

	q1 = _mm_load_pd(&q[nb*ldq]);
	q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1));
	_mm_store_pd(&q[nb*ldq], q1);
	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
	q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1));
	_mm_store_pd(&q[(nb*ldq)+2], q2);
	q3 = _mm_load_pd(&q[(nb*ldq)+4]);
	q3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1));
	_mm_store_pd(&q[(nb*ldq)+4], q3);
	q4 = _mm_load_pd(&q[(nb*ldq)+6]);
	q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1));
	_mm_store_pd(&q[(nb*ldq)+6], q4);
	q5 = _mm_load_pd(&q[(nb*ldq)+8]);
	q5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1));
	_mm_store_pd(&q[(nb*ldq)+8], q5);
	q6 = _mm_load_pd(&q[(nb*ldq)+10]);
	q6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1));
	_mm_store_pd(&q[(nb*ldq)+10], q6);
}
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_8_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [8 x nb+1] * hh
	// hh contains two householder vectors, with offset 1
	/////////////////////////////////////////////////////
	int i;
	// Needed bit mask for floating point sign flip
	__m128d sign = (__m128d)_mm_set1_epi64x(0x8000000000000000LL);

	__m128d x1 = _mm_load_pd(&q[ldq]);
	__m128d x2 = _mm_load_pd(&q[ldq+2]);
	__m128d x3 = _mm_load_pd(&q[ldq+4]);
	__m128d x4 = _mm_load_pd(&q[ldq+6]);

	__m128d h1 = _mm_loaddup_pd(&hh[ldh+1]);
	__m128d h2;

	__m128d q1 = _mm_load_pd(q);
	__m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1));
	__m128d q2 = _mm_load_pd(&q[2]);
	__m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1));
	__m128d q3 = _mm_load_pd(&q[4]);
	__m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1));
	__m128d q4 = _mm_load_pd(&q[6]);
	__m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1));

	for (i = 2; i < nb; i++)
	{
		h1 = _mm_loaddup_pd(&hh[i-1]);
		h2 = _mm_loaddup_pd(&hh[ldh+i]);

		q1 = _mm_load_pd(&q[i*ldq]);
		x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
		y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
		q2 = _mm_load_pd(&q[(i*ldq)+2]);
		x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1));
		y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2));
		q3 = _mm_load_pd(&q[(i*ldq)+4]);
		x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1));
		y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2));
		q4 = _mm_load_pd(&q[(i*ldq)+6]);
		x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1));
		y4 = _mm_add_pd(y4, _mm_mul_pd(q4,h2));
	}

	h1 = _mm_loaddup_pd(&hh[nb-1]);

	q1 = _mm_load_pd(&q[nb*ldq]);
	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
	x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1));
	q3 = _mm_load_pd(&q[(nb*ldq)+4]);
	x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1));
	q4 = _mm_load_pd(&q[(nb*ldq)+6]);
	x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1));

	/////////////////////////////////////////////////////
	// Rank-2 update of Q [8 x nb+1]
	/////////////////////////////////////////////////////
	__m128d tau1 = _mm_loaddup_pd(hh);
	__m128d tau2 = _mm_loaddup_pd(&hh[ldh]);
	__m128d vs = _mm_loaddup_pd(&s);

	h1 = _mm_xor_pd(tau1, sign);
	x1 = _mm_mul_pd(x1, h1);
	x2 = _mm_mul_pd(x2, h1);
	x3 = _mm_mul_pd(x3, h1);
	x4 = _mm_mul_pd(x4, h1);

	h1 = _mm_xor_pd(tau2, sign);
	h2 = _mm_mul_pd(h1, vs);

	y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2));
	y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2));
	y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2));
	y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2));

	q1 = _mm_load_pd(q);
	q1 = _mm_add_pd(q1, y1);
	_mm_store_pd(q, q1);
	q2 = _mm_load_pd(&q[2]);
	q2 = _mm_add_pd(q2, y2);
	_mm_store_pd(&q[2], q2);
	q3 = _mm_load_pd(&q[4]);
	q3 = _mm_add_pd(q3, y3);
	_mm_store_pd(&q[4], q3);
	q4 = _mm_load_pd(&q[6]);
	q4 = _mm_add_pd(q4, y4);
	_mm_store_pd(&q[6], q4);

	h2 = _mm_loaddup_pd(&hh[ldh+1]);

	q1 = _mm_load_pd(&q[ldq]);
	q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2)));
	_mm_store_pd(&q[ldq], q1);
	q2 = _mm_load_pd(&q[ldq+2]);
	q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2)));
	_mm_store_pd(&q[ldq+2], q2);
	q3 = _mm_load_pd(&q[ldq+4]);
	q3 = _mm_add_pd(q3, _mm_add_pd(x3, _mm_mul_pd(y3, h2)));
	_mm_store_pd(&q[ldq+4], q3);
	q4 = _mm_load_pd(&q[ldq+6]);
	q4 = _mm_add_pd(q4, _mm_add_pd(x4, _mm_mul_pd(y4, h2)));
	_mm_store_pd(&q[ldq+6], q4);

	for (i = 2; i < nb; i++)
	{
		h1 = _mm_loaddup_pd(&hh[i-1]);
		h2 = _mm_loaddup_pd(&hh[ldh+i]);

		q1 = _mm_load_pd(&q[i*ldq]);
		q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)));
		_mm_store_pd(&q[i*ldq], q1);
		q2 = _mm_load_pd(&q[(i*ldq)+2]);
		q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2)));
		_mm_store_pd(&q[(i*ldq)+2], q2);
		q3 = _mm_load_pd(&q[(i*ldq)+4]);
		q3 = _mm_add_pd(q3, _mm_add_pd(_mm_mul_pd(x3,h1), _mm_mul_pd(y3, h2)));
		_mm_store_pd(&q[(i*ldq)+4], q3);
		q4 = _mm_load_pd(&q[(i*ldq)+6]);
		q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4,h1), _mm_mul_pd(y4, h2)));
		_mm_store_pd(&q[(i*ldq)+6], q4);
	}

	h1 = _mm_loaddup_pd(&hh[nb-1]);

	q1 = _mm_load_pd(&q[nb*ldq]);
	q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1));
	_mm_store_pd(&q[nb*ldq], q1);
	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
	q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1));
	_mm_store_pd(&q[(nb*ldq)+2], q2);
	q3 = _mm_load_pd(&q[(nb*ldq)+4]);
	q3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1));
	_mm_store_pd(&q[(nb*ldq)+4], q3);
	q4 = _mm_load_pd(&q[(nb*ldq)+6]);
	q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1));
	_mm_store_pd(&q[(nb*ldq)+6], q4);
}
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a
* matrix Vector product with two householder
* vectors + a rank 2 update is performed
*/
__forceinline void hh_trafo_kernel_4_SSE_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [4 x nb+1] * hh
	// hh contains two householder vectors, with offset 1
	/////////////////////////////////////////////////////
	int i;
	// Needed bit mask for floating point sign flip
	__m128d sign = (__m128d)_mm_set1_epi64x(0x8000000000000000LL);

	__m128d x1 = _mm_load_pd(&q[ldq]);
	__m128d x2 = _mm_load_pd(&q[ldq+2]);

	__m128d h1 = _mm_loaddup_pd(&hh[ldh+1]);
	__m128d h2;

	__m128d q1 = _mm_load_pd(q);
	__m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1));
	__m128d q2 = _mm_load_pd(&q[2]);
	__m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1));

	for (i = 2; i < nb; i++)
	{
		h1 = _mm_loaddup_pd(&hh[i-1]);
		h2 = _mm_loaddup_pd(&hh[ldh+i]);

		q1 = _mm_load_pd(&q[i*ldq]);
		x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
		y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
		q2 = _mm_load_pd(&q[(i*ldq)+2]);
		x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1));
		y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2));
	}

	h1 = _mm_loaddup_pd(&hh[nb-1]);

	q1 = _mm_load_pd(&q[nb*ldq]);
	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
	x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1));

	/////////////////////////////////////////////////////
	// Rank-2 update of Q [4 x nb+1]
	/////////////////////////////////////////////////////
	__m128d tau1 = _mm_loaddup_pd(hh);
	__m128d tau2 = _mm_loaddup_pd(&hh[ldh]);
	__m128d vs = _mm_loaddup_pd(&s);

	h1 = _mm_xor_pd(tau1, sign);
	x1 = _mm_mul_pd(x1, h1);
	x2 = _mm_mul_pd(x2, h1);

	h1 = _mm_xor_pd(tau2, sign);
	h2 = _mm_mul_pd(h1, vs);

	y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2));
	y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2));

	q1 = _mm_load_pd(q);
	q1 = _mm_add_pd(q1, y1);
	_mm_store_pd(q, q1);
	q2 = _mm_load_pd(&q[2]);
	q2 = _mm_add_pd(q2, y2);
	_mm_store_pd(&q[2], q2);

	h2 = _mm_loaddup_pd(&hh[ldh+1]);

	q1 = _mm_load_pd(&q[ldq]);
	q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2)));
	_mm_store_pd(&q[ldq], q1);
	q2 = _mm_load_pd(&q[ldq+2]);
	q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2)));
	_mm_store_pd(&q[ldq+2], q2);

	for (i = 2; i < nb; i++)
	{
		h1 = _mm_loaddup_pd(&hh[i-1]);
		h2 = _mm_loaddup_pd(&hh[ldh+i]);

		q1 = _mm_load_pd(&q[i*ldq]);
		q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)));
		_mm_store_pd(&q[i*ldq], q1);
		q2 = _mm_load_pd(&q[(i*ldq)+2]);
		q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2)));
		_mm_store_pd(&q[(i*ldq)+2], q2);
	}

	h1 = _mm_loaddup_pd(&hh[nb-1]);
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "elpa2_kernels_real_sse_2hv_template.Xc"
#undef REALCASE
#undef DOUBLE_PRECISION
	q1 = _mm_load_pd(&q[nb*ldq]);
	q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1));
	_mm_store_pd(&q[nb*ldq], q1);
	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
	q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1));
	_mm_store_pd(&q[(nb*ldq)+2], q2);
}
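After this commit the double-precision file reduces to the six preprocessor lines interleaved above (the +7 of this file): the deleted kernels move into the new elpa2_kernels_real_sse_2hv_template.Xc and are instantiated per precision via src/general/precision_macros.h. The macro definitions themselves are not shown in this diff; the block below is only a sketch of the general pattern, with invented names, to illustrate how one template can expand to both the _double and _single kernels.

/* Hedged sketch of the precision-macro pattern; the actual macro names in
 * src/general/precision_macros.h may differ. The template is written once
 * against the generic names and included under each precision in turn. */
#ifdef DOUBLE_PRECISION
#define DATA_TYPE          double
#define SIMD_TYPE          __m128d
#define SIMD_LOAD          _mm_load_pd
#define SIMD_STORE         _mm_store_pd
#define KERNEL_NAME(base)  base##_double   /* hh_trafo_kernel_4_SSE_2hv_double, ... */
#else
#define DATA_TYPE          float
#define SIMD_TYPE          __m128
#define SIMD_LOAD          _mm_load_ps
#define SIMD_STORE         _mm_store_ps
#define KERNEL_NAME(base)  base##_single   /* hh_trafo_kernel_4_SSE_2hv_single, ... */
#endif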
src/elpa2/kernels/elpa2_kernels_real_sse_2hv_single_precision.c
...
@@ -42,455 +42,14 @@
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
//
#include "config-f90.h"
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
//Forward declaration
__forceinline void hh_trafo_kernel_4_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_8_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_12_SSE_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);

void double_hh_trafo_real_sse_2hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
/*
!f>#ifdef HAVE_SSE_INTRINSICS