Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
15
Issues
15
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
e04b11b2
Commit
e04b11b2
authored
May 06, 2019
by
Andreas Marek
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Unify 128bit kernels real block2, block4, and block6
parent
b6369e22
Changes
11
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
6060 additions
and
3293 deletions
+6060
-3293
Doxyfile.in
Doxyfile.in
+1
-1
Makefile.am
Makefile.am
+0
-1
src/elpa2/kernels/real_128bit_BLOCK_template.c
src/elpa2/kernels/real_128bit_BLOCK_template.c
+6044
-1120
src/elpa2/kernels/real_sparc64_2hv_double_precision.c
src/elpa2/kernels/real_sparc64_2hv_double_precision.c
+1
-1
src/elpa2/kernels/real_sparc64_2hv_single_precision.c
src/elpa2/kernels/real_sparc64_2hv_single_precision.c
+1
-1
src/elpa2/kernels/real_sparc64_4hv_single_precision.c
src/elpa2/kernels/real_sparc64_4hv_single_precision.c
+1
-1
src/elpa2/kernels/real_sparc64_6hv_double_precision.c
src/elpa2/kernels/real_sparc64_6hv_double_precision.c
+3
-1
src/elpa2/kernels/real_sparc64_6hv_single_precision.c
src/elpa2/kernels/real_sparc64_6hv_single_precision.c
+3
-1
src/elpa2/kernels/real_sse_6hv_double_precision.c
src/elpa2/kernels/real_sse_6hv_double_precision.c
+3
-1
src/elpa2/kernels/real_sse_6hv_single_precision.c
src/elpa2/kernels/real_sse_6hv_single_precision.c
+3
-1
src/elpa2/kernels/real_sse_6hv_template.c
src/elpa2/kernels/real_sse_6hv_template.c
+0
-2164
No files found.
Doxyfile.in
View file @
e04b11b2
...
...
@@ -900,7 +900,7 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_
sse_6hv
_template.c \
@top_srcdir@/src/elpa2/kernels/real_
128_BLOCK
_template.c \
@top_srcdir@/src/elpa2/kernels/complex_template.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_2hv_double_precision.c \
...
...
Makefile.am
View file @
e04b11b2
...
...
@@ -785,7 +785,6 @@ EXTRA_DIST = \
src/elpa2/kernels/real_vsx_4hv_template.c
\
src/elpa2/kernels/real_vsx_6hv_template.c
\
src/elpa2/kernels/real_128bit_BLOCK_template.c
\
src/elpa2/kernels/real_sse_6hv_template.c
\
src/elpa2/kernels/real_template.F90
\
src/elpa2/kernels/simple_template.F90
\
src/elpa2/kernels/simple_block4_template.F90
\
...
...
src/elpa2/kernels/real_128bit_BLOCK_template.c
View file @
e04b11b2
This source diff could not be displayed because it is too large. You can
view the blob
instead.
src/elpa2/kernels/real_sparc64_2hv_double_precision.c
View file @
e04b11b2
...
...
@@ -48,7 +48,7 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK2
#define BLOCK2
1
#include "../../general/precision_macros.h"
#include "real_128bit_BLOCK_template.c"
#undef REALCASE
...
...
src/elpa2/kernels/real_sparc64_2hv_single_precision.c
View file @
e04b11b2
...
...
@@ -48,7 +48,7 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK2
#define BLOCK2
1
#include "../../general/precision_macros.h"
#include "real_128bit_BLOCK_template.c"
#undef BLOCK2
...
...
src/elpa2/kernels/real_sparc64_4hv_single_precision.c
View file @
e04b11b2
...
...
@@ -48,7 +48,7 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK4
#define BLOCK4
1
#include "../../general/precision_macros.h"
#include "real_128bit_BLOCK_template.c"
#undef BLOCK4
...
...
src/elpa2/kernels/real_sparc64_6hv_double_precision.c
View file @
e04b11b2
...
...
@@ -48,8 +48,10 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK6 1
#include "../../general/precision_macros.h"
#include "real_
sse_6hv
_template.c"
#include "real_
128bit_BLOCK
_template.c"
#undef REALCASE
#undef BLOCK6
#undef DOUBLE_PRECISION
src/elpa2/kernels/real_sparc64_6hv_single_precision.c
View file @
e04b11b2
...
...
@@ -48,8 +48,10 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK6 1
#include "../../general/precision_macros.h"
#include "real_
sse_6hv
_template.c"
#include "real_
128bit_BLOCK
_template.c"
#undef REALCASE
#undef BLOCK6
#undef SINGLE_PRECISION
src/elpa2/kernels/real_sse_6hv_double_precision.c
View file @
e04b11b2
...
...
@@ -48,8 +48,10 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK6 1
#include "../../general/precision_macros.h"
#include "real_
sse_6hv
_template.c"
#include "real_
128bit_BLOCK
_template.c"
#undef REALCASE
#undef BLOCK6
#undef DOUBLE_PRECISION
src/elpa2/kernels/real_sse_6hv_single_precision.c
View file @
e04b11b2
...
...
@@ -48,8 +48,10 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK6 1
#include "../../general/precision_macros.h"
#include "real_
sse_6hv
_template.c"
#include "real_
128bit_BLOCK
_template.c"
#undef REALCASE
#undef BLOCK6
#undef SINGLE__PRECISION
src/elpa2/kernels/real_sse_6hv_template.c
deleted
100644 → 0
View file @
b6369e22
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de), based on Alexander Heinecke (alexander.heinecke@mytum.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#ifdef HAVE_SSE_INTRINSICS
#include <x86intrin.h>
#endif
#ifdef HAVE_SPARC64_SSE
#include <fjmfunc.h>
#include <emmintrin.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static
#ifdef DOUBLE_PRECISION_REAL
#define offset 2
#define __SSE_DATATYPE __m128d
#define _SSE_LOAD _mm_load_pd
#define _SSE_ADD _mm_add_pd
#define _SSE_SUB _mm_sub_pd
#define _SSE_MUL _mm_mul_pd
#define _SSE_STORE _mm_store_pd
#define _SSE_SET _mm_set_pd
#define _SSE_SET1 _mm_set1_pd
#endif
#ifdef SINGLE_PRECISION_REAL
#define offset 4
#define __SSE_DATATYPE __m128
#define _SSE_LOAD _mm_load_ps
#define _SSE_ADD _mm_add_ps
#define _SSE_SUB _mm_sub_ps
#define _SSE_MUL _mm_mul_ps
#define _SSE_STORE _mm_store_ps
#define _SSE_SET _mm_set_ps
#define _SSE_SET1 _mm_set1_ps
#endif
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
//Forward declaration
static
void
hh_trafo_kernel_2_SSE_6hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
);
static
void
hh_trafo_kernel_4_SSE_6hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
);
void
hexa_hh_trafo_real_sse_6hv_double
(
double
*
q
,
double
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
);
#endif
#ifdef SINGLE_PRECISION_REAL
static
void
hh_trafo_kernel_4_SSE_6hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
*
scalarprods
);
static
void
hh_trafo_kernel_8_SSE_6hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
*
scalarprods
);
void
hexa_hh_trafo_real_sse_6hv_single_
(
float
*
q
,
float
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
//Forward declaration
static
void
hh_trafo_kernel_2_SPARC64_6hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
);
static
void
hh_trafo_kernel_4_SPARC64_6hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
);
void
hexa_hh_trafo_real_sparc64_6hv_double
(
double
*
q
,
double
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
);
#endif
#ifdef SINGLE_PRECISION_REAL
static
void
hh_trafo_kernel_4_SPARC64_6hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
*
scalarprods
);
static
void
hh_trafo_kernel_8_SPARC64_6hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
*
scalarprods
);
void
hexa_hh_trafo_real_sparc64_6hv_single_
(
float
*
q
,
float
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
);
#endif
#endif
#ifdef DOUBLE_PRECISION_REAL
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine hexa_hh_trafo_real_sse_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_sse_6hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f> subroutine hexa_hh_trafo_real_sparc64_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_sparc64_6hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_REAL
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine hexa_hh_trafo_real_sse_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_sse_6hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f> subroutine hexa_hh_trafo_real_sparc64_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_sparc64_6hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
void
hexa_hh_trafo_real_sse_6hv_double
(
double
*
q
,
double
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
)
#endif
#ifdef SINGLE_PRECISION_REAL
void
hexa_hh_trafo_real_sse_6hv_single
(
float
*
q
,
float
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
void
hexa_hh_trafo_real_sparc64_6hv_double
(
double
*
q
,
double
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
)
#endif
#ifdef SINGLE_PRECISION_REAL
void
hexa_hh_trafo_real_sparc64_6hv_single
(
float
*
q
,
float
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
)
#endif
#endif
{
int
i
;
int
nb
=
*
pnb
;
int
nq
=
*
pldq
;
int
ldq
=
*
pldq
;
int
ldh
=
*
pldh
;
int
worked_on
;
worked_on
=
0
;
// calculating scalar products to compute
// 6 householder vectors simultaneously
#ifdef DOUBLE_PRECISION_REAL
double
scalarprods
[
15
];
#endif
#ifdef SINGLE_PRECISION_REAL
float
scalarprods
[
15
];
#endif
scalarprods
[
0
]
=
hh
[(
ldh
+
1
)];
// 1 = hh(2,2)
scalarprods
[
1
]
=
hh
[(
ldh
*
2
)
+
2
];
// 2 = hh(3,3)
scalarprods
[
2
]
=
hh
[(
ldh
*
2
)
+
1
];
// 3 = hh(2,3)
scalarprods
[
3
]
=
hh
[(
ldh
*
3
)
+
3
];
// 4 = hh(4,4)
scalarprods
[
4
]
=
hh
[(
ldh
*
3
)
+
2
];
// 5 = hh(3,4)
scalarprods
[
5
]
=
hh
[(
ldh
*
3
)
+
1
];
// 6 = hh(2,4)
scalarprods
[
6
]
=
hh
[(
ldh
*
4
)
+
4
];
// 7 = hh(5,5)
scalarprods
[
7
]
=
hh
[(
ldh
*
4
)
+
3
];
// 8 = hh(4,5)
scalarprods
[
8
]
=
hh
[(
ldh
*
4
)
+
2
];
// 9 = hh(3,5)
scalarprods
[
9
]
=
hh
[(
ldh
*
4
)
+
1
];
//10 = hh(2,5)
scalarprods
[
10
]
=
hh
[(
ldh
*
5
)
+
5
];
//11 = hh(6,6)
scalarprods
[
11
]
=
hh
[(
ldh
*
5
)
+
4
];
//12 = hh(5,6)
scalarprods
[
12
]
=
hh
[(
ldh
*
5
)
+
3
];
//13 = hh(4,6)
scalarprods
[
13
]
=
hh
[(
ldh
*
5
)
+
2
];
//14 = hh(3,6)
scalarprods
[
14
]
=
hh
[(
ldh
*
5
)
+
1
];
//15 = hh(2,6)
// calculate scalar product of first and fourth householder Vector
// loop counter = 2
scalarprods
[
0
]
+=
hh
[
1
]
*
hh
[(
2
+
ldh
)];
// 1 = 1 + hh(2,1) * hh(3,2)
scalarprods
[
2
]
+=
hh
[(
ldh
)
+
1
]
*
hh
[
2
+
(
ldh
*
2
)];
// 3 = 3 + hh(2,2) * hh(3,3)
scalarprods
[
5
]
+=
hh
[(
ldh
*
2
)
+
1
]
*
hh
[
2
+
(
ldh
*
3
)];
// 6 = 6 + hh(2,3) * hh(3,4)
scalarprods
[
9
]
+=
hh
[(
ldh
*
3
)
+
1
]
*
hh
[
2
+
(
ldh
*
4
)];
//10 =10 + hh(2,4) * hh(3,5)
scalarprods
[
14
]
+=
hh
[(
ldh
*
4
)
+
1
]
*
hh
[
2
+
(
ldh
*
5
)];
//15 =15 + hh(2,5) * hh(3,6)
// loop counter = 3
scalarprods
[
0
]
+=
hh
[
2
]
*
hh
[(
3
+
ldh
)];
// 1 = 1 + hh(3,1) * hh(4,2)
scalarprods
[
2
]
+=
hh
[(
ldh
)
+
2
]
*
hh
[
3
+
(
ldh
*
2
)];
// 3 = 3 + hh(3,2) * hh(4,3)
scalarprods
[
5
]
+=
hh
[(
ldh
*
2
)
+
2
]
*
hh
[
3
+
(
ldh
*
3
)];
// 6 = 6 + hh(3,3) * hh(4,4)
scalarprods
[
9
]
+=
hh
[(
ldh
*
3
)
+
2
]
*
hh
[
3
+
(
ldh
*
4
)];
//10 =10 + hh(3,4) * hh(4,5)
scalarprods
[
14
]
+=
hh
[(
ldh
*
4
)
+
2
]
*
hh
[
3
+
(
ldh
*
5
)];
//15 =15 + hh(3,5) * hh(4,6)
scalarprods
[
1
]
+=
hh
[
1
]
*
hh
[
3
+
(
ldh
*
2
)];
// 2 = 2 + hh(2,1) * hh(4,3)
scalarprods
[
4
]
+=
hh
[(
ldh
*
1
)
+
1
]
*
hh
[
3
+
(
ldh
*
3
)];
// 5 = 5 + hh(2,2) * hh(4,4)
scalarprods
[
8
]
+=
hh
[(
ldh
*
2
)
+
1
]
*
hh
[
3
+
(
ldh
*
4
)];
// 9 = 9 + hh(2,3) * hh(4,5)
scalarprods
[
13
]
+=
hh
[(
ldh
*
3
)
+
1
]
*
hh
[
3
+
(
ldh
*
5
)];
//14 =14 + hh(2,4) * hh(4,6)
// loop counter = 4
scalarprods
[
0
]
+=
hh
[
3
]
*
hh
[(
4
+
ldh
)];
// 1 = 1 + hh(4,1) * hh(5,2)
scalarprods
[
2
]
+=
hh
[(
ldh
)
+
3
]
*
hh
[
4
+
(
ldh
*
2
)];
// 3 = 3 + hh(4,2) * hh(5,3)
scalarprods
[
5
]
+=
hh
[(
ldh
*
2
)
+
3
]
*
hh
[
4
+
(
ldh
*
3
)];
// 6 = 6 + hh(4,3) * hh(5,4)
scalarprods
[
9
]
+=
hh
[(
ldh
*
3
)
+
3
]
*
hh
[
4
+
(
ldh
*
4
)];
//10 =10 + hh(4,4) * hh(5,5)
scalarprods
[
14
]
+=
hh
[(
ldh
*
4
)
+
3
]
*
hh
[
4
+
(
ldh
*
5
)];
//15 =15 + hh(4,5) * hh(5,6)
scalarprods
[
1
]
+=
hh
[
2
]
*
hh
[
4
+
(
ldh
*
2
)];
// 2 = 2 + hh(3,1) * hh(5,3)
scalarprods
[
4
]
+=
hh
[(
ldh
*
1
)
+
2
]
*
hh
[
4
+
(
ldh
*
3
)];
// 5 = 5 + hh(3,2) * hh(5,4)
scalarprods
[
8
]
+=
hh
[(
ldh
*
2
)
+
2
]
*
hh
[
4
+
(
ldh
*
4
)];
// 9 = 9 + hh(3,3) * hh(5,5)
scalarprods
[
13
]
+=
hh
[(
ldh
*
3
)
+
2
]
*
hh
[
4
+
(
ldh
*
5
)];
//14 =14 + hh(3,4) * hh(5,6)
scalarprods
[
3
]
+=
hh
[
1
]
*
hh
[
4
+
(
ldh
*
3
)];
// 4 = 4 + hh(2,1) * hh(5,4)
scalarprods
[
7
]
+=
hh
[(
ldh
)
+
1
]
*
hh
[
4
+
(
ldh
*
4
)];
// 8 = 8 + hh(2,2) * hh(5,5)
scalarprods
[
12
]
+=
hh
[(
ldh
*
2
)
+
1
]
*
hh
[
4
+
(
ldh
*
5
)];
//13 =13 + hh(2,3) * hh(5,6)
// loop counter = 5
scalarprods
[
0
]
+=
hh
[
4
]
*
hh
[(
5
+
ldh
)];
// 1 = 1 + hh(5,1) * hh(6,2)
scalarprods
[
2
]
+=
hh
[(
ldh
)
+
4
]
*
hh
[
5
+
(
ldh
*
2
)];
// 3 = 3 + hh(5,2) * hh(6,3)
scalarprods
[
5
]
+=
hh
[(
ldh
*
2
)
+
4
]
*
hh
[
5
+
(
ldh
*
3
)];
// 6 = 6 + hh(5,3) * hh(6,4)
scalarprods
[
9
]
+=
hh
[(
ldh
*
3
)
+
4
]
*
hh
[
5
+
(
ldh
*
4
)];
//10 =10 + hh(5,4) * hh(6,5)
scalarprods
[
14
]
+=
hh
[(
ldh
*
4
)
+
4
]
*
hh
[
5
+
(
ldh
*
5
)];
//15 =15 + hh(5,5) * hh(6,6)
scalarprods
[
1
]
+=
hh
[
3
]
*
hh
[
5
+
(
ldh
*
2
)];
// 2 = 2 + hh(4,1) * hh(6,3)
scalarprods
[
4
]
+=
hh
[(
ldh
*
1
)
+
3
]
*
hh
[
5
+
(
ldh
*
3
)];
// 5 = 5 + hh(4,2) * hh(6,4)
scalarprods
[
8
]
+=
hh
[(
ldh
*
2
)
+
3
]
*
hh
[
5
+
(
ldh
*
4
)];
// 9 = 9 + hh(4,3) * hh(6,5)
scalarprods
[
13
]
+=
hh
[(
ldh
*
3
)
+
3
]
*
hh
[
5
+
(
ldh
*
5
)];
//14 =14 + hh(4,4) * hh(6,6)
scalarprods
[
3
]
+=
hh
[
2
]
*
hh
[
5
+
(
ldh
*
3
)];
// 4 = 4 + hh(3,1) * hh(6,4)
scalarprods
[
7
]
+=
hh
[(
ldh
)
+
2
]
*
hh
[
5
+
(
ldh
*
4
)];
// 8 = 8 + hh(3,2) * hh(6,5)
scalarprods
[
12
]
+=
hh
[(
ldh
*
2
)
+
2
]
*
hh
[
5
+
(
ldh
*
5
)];
//13 =13 + hh(3,3) * hh(6,6)
scalarprods
[
6
]
+=
hh
[
1
]
*
hh
[
5
+
(
ldh
*
4
)];
// 7 = 7 + hh(2,1) * hh(6,5)
scalarprods
[
11
]
+=
hh
[(
ldh
)
+
1
]
*
hh
[
5
+
(
ldh
*
5
)];
//12 =12 + hh(2,2) * hh(6,6)
#pragma ivdep
for
(
i
=
6
;
i
<
nb
;
i
++
)
// do i = 7, nb
{
scalarprods
[
0
]
+=
hh
[
i
-
1
]
*
hh
[(
i
+
ldh
)];
// 1 = 1 + hh(i-1,1) * hh(i,2)
scalarprods
[
2
]
+=
hh
[(
ldh
)
+
i
-
1
]
*
hh
[
i
+
(
ldh
*
2
)];
// 3 = 3 + hh(i-1,2) * hh(i,3)
scalarprods
[
5
]
+=
hh
[(
ldh
*
2
)
+
i
-
1
]
*
hh
[
i
+
(
ldh
*
3
)];
// 6 = 6 + hh(i-1,3) * hh(i,4)
scalarprods
[
9
]
+=
hh
[(
ldh
*
3
)
+
i
-
1
]
*
hh
[
i
+
(
ldh
*
4
)];
//10 =10 + hh(i-1,4) * hh(i,5)
scalarprods
[
14
]
+=
hh
[(
ldh
*
4
)
+
i
-
1
]
*
hh
[
i
+
(
ldh
*
5
)];
//15 =15 + hh(i-1,5) * hh(i,6)
scalarprods
[
1
]
+=
hh
[
i
-
2
]
*
hh
[
i
+
(
ldh
*
2
)];
// 2 = 2 + hh(i-2,1) * hh(i,3)
scalarprods
[
4
]
+=
hh
[(
ldh
*
1
)
+
i
-
2
]
*
hh
[
i
+
(
ldh
*
3
)];
// 5 = 5 + hh(i-2,2) * hh(i,4)
scalarprods
[
8
]
+=
hh
[(
ldh
*
2
)
+
i
-
2
]
*
hh
[
i
+
(
ldh
*
4
)];
// 9 = 9 + hh(i-2,3) * hh(i,5)
scalarprods
[
13
]
+=
hh
[(
ldh
*
3
)
+
i
-
2
]
*
hh
[
i
+
(
ldh
*
5
)];
//14 =14 + hh(i-2,4) * hh(i,6)
scalarprods
[
3
]
+=
hh
[
i
-
3
]
*
hh
[
i
+
(
ldh
*
3
)];
// 4 = 4 + hh(i-3,1) * hh(i,4)
scalarprods
[
7
]
+=
hh
[(
ldh
)
+
i
-
3
]
*
hh
[
i
+
(
ldh
*
4
)];
// 8 = 8 + hh(i-3,2) * hh(i,5)
scalarprods
[
12
]
+=
hh
[(
ldh
*
2
)
+
i
-
3
]
*
hh
[
i
+
(
ldh
*
5
)];
//13 =13 + hh(i-3,3) * hh(i,6)
scalarprods
[
6
]
+=
hh
[
i
-
4
]
*
hh
[
i
+
(
ldh
*
4
)];
// 7 = 7 + hh(i-4,1) * hh(i,5)
scalarprods
[
11
]
+=
hh
[(
ldh
)
+
i
-
4
]
*
hh
[
i
+
(
ldh
*
5
)];
//12 =12 + hh(i-4,2) * hh(i,6)
scalarprods
[
10
]
+=
hh
[
i
-
5
]
*
hh
[
i
+
(
ldh
*
5
)];
//11 =11 + hh(i-5,1) * hh(i,6)
}
// Production level kernel calls with padding
#ifdef DOUBLE_PRECISION_REAL
for
(
i
=
0
;
i
<
nq
-
2
;
i
+=
4
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_4_SSE_6hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
scalarprods
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_4_SPARC64_6hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
scalarprods
);
#endif
worked_on
+=
4
;
}
#endif
#ifdef SINGLE_PRECISION_REAL
for
(
i
=
0
;
i
<
nq
-
4
;
i
+=
8
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_8_SSE_6hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
scalarprods
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_8_SPARC64_6hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
scalarprods
);
#endif
worked_on
+=
8
;
}
#endif
if
(
nq
==
i
)
{
return
;
}
#ifdef DOUBLE_PRECISION_REAL
if
(
nq
-
i
==
2
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_2_SSE_6hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
scalarprods
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_2_SPARC64_6hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
scalarprods
);
#endif
worked_on
+=
2
;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if
(
nq
-
i
==
4
)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_4_SSE_6hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
scalarprods
);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_4_SPARC64_6hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
scalarprods
);
#endif
worked_on
+=
4
;
}
#endif
#ifdef WITH_DEBUG
if
(
worked_on
!=
nq
)
{
#ifdef HAVE_SSE_INTRINSICS
printf
(
"Error in real SSE BLOCK6 kernel
\n
"
);
#endif
#ifdef HAVE_SPARC64_SSE
printf
(
"Error in real SPARC64 BLOCK6 kernel
\n
"
);
#endif
abort
();
}
#endif
}
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 4 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 8 rows of Q simultaneously, a
#endif
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_4_SSE_6hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_8_SSE_6hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
*
scalarprods
)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_4_SPARC64_6hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_8_SPARC64_6hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
*
scalarprods
)
#endif
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [4 x nb+3] * hh
// hh contains four householder vectors
/////////////////////////////////////////////////////
int
i
;
__SSE_DATATYPE
a1_1
=
_SSE_LOAD
(
&
q
[
ldq
*
5
]);
// a_1_1 = q(1:nq,6)
__SSE_DATATYPE
a2_1
=
_SSE_LOAD
(
&
q
[
ldq
*
4
]);
// a_2_1 = q(1:nq,5)
__SSE_DATATYPE
a3_1
=
_SSE_LOAD
(
&
q
[
ldq
*
3
]);
// a_3_1 = q(1:nq,4)
__SSE_DATATYPE
a4_1
=
_SSE_LOAD
(
&
q
[
ldq
*
2
]);
// a_4_1 = q(1:nq,3)
__SSE_DATATYPE
a5_1
=
_SSE_LOAD
(
&
q
[
ldq
]);
// a_5_1 = q(1,nq,2)
__SSE_DATATYPE
a6_1
=
_SSE_LOAD
(
&
q
[
0
]);
// a_6_1 = q(1:nq,1)
#ifdef HAVE_SSE_INTRINSICS
__SSE_DATATYPE
h_6_5
=
_SSE_SET1
(
hh
[(
ldh
*
5
)
+
1
]);
// h_6_5 = hh(2,6)
__SSE_DATATYPE
h_6_4
=
_SSE_SET1
(
hh
[(
ldh
*
5
)
+
2
]);
// h_6_4 = hh(3,6)
__SSE_DATATYPE
h_6_3
=
_SSE_SET1
(
hh
[(
ldh
*
5
)
+
3
]);
// h_6_3 = hh(4,6)
__SSE_DATATYPE
h_6_2
=
_SSE_SET1
(
hh
[(
ldh
*
5
)
+
4
]);
// h_6_2 = hh(5,6)
__SSE_DATATYPE
h_6_1
=
_SSE_SET1
(
hh
[(
ldh
*
5
)
+
5
]);
// h_6_1 = hh(6,6)
#endif
#ifdef HAVE_SPARC64_SSE
__SSE_DATATYPE
h_6_5
=
_SSE_SET
(
hh
[(
ldh
*
5
)
+
1
],
hh
[(
ldh
*
5
)
+
1
]);
__SSE_DATATYPE
h_6_4
=
_SSE_SET
(
hh
[(
ldh
*
5
)
+
2
],
hh
[(
ldh
*
5
)
+
2
]);
__SSE_DATATYPE
h_6_3
=
_SSE_SET
(
hh
[(
ldh
*
5
)
+
3
],
hh
[(
ldh
*
5
)
+
3
]);
__SSE_DATATYPE
h_6_2
=
_SSE_SET
(
hh
[(
ldh
*
5
)
+
4
],
hh
[(
ldh
*
5
)
+
4
]);
__SSE_DATATYPE
h_6_1
=
_SSE_SET
(
hh
[(
ldh
*
5
)
+
5
],
hh
[(
ldh
*
5
)
+
5
]);
#endif
register
__SSE_DATATYPE
t1
=
_SSE_ADD
(
a6_1
,
_SSE_MUL
(
a5_1
,
h_6_5
));
// t1 = a_6_1 + a_5_1 * h_6_5
t1
=
_SSE_ADD
(
t1
,
_SSE_MUL
(
a4_1
,
h_6_4
));
// t1 = t1 + a_4_1 * h_6_4
t1
=
_SSE_ADD
(
t1
,
_SSE_MUL
(
a3_1
,
h_6_3
));
// t1 = t1 + a_3_1 * h_6_3
t1
=
_SSE_ADD
(
t1
,
_SSE_MUL
(
a2_1
,
h_6_2
));
// t1 = t1 + a_2_1 * h_6_2
t1
=
_SSE_ADD
(
t1
,
_SSE_MUL
(
a1_1
,
h_6_1
));
// t1 = t1 + a_1_1 * h_6_1
#ifdef HAVE_SSE_INTRINSICS
__SSE_DATATYPE
h_5_4
=
_SSE_SET1
(
hh
[(
ldh
*
4
)
+
1
]);
// h_5_4 = hh(2,5)
__SSE_DATATYPE
h_5_3
=
_SSE_SET1
(
hh
[(
ldh
*
4
)
+
2
]);
// h_5_3 = hh(3,5)
__SSE_DATATYPE
h_5_2
=
_SSE_SET1
(
hh
[(
ldh
*
4
)
+
3
]);
// h_5_2 = hh(4,5)
__SSE_DATATYPE
h_5_1
=
_SSE_SET1
(
hh
[(
ldh
*
4
)
+
4
]);
// h_5_1 = hh(5,5)
#endif
#ifdef HAVE_SPARC64_SSE
__SSE_DATATYPE
h_5_4
=
_SSE_SET
(
hh
[(
ldh
*
4
)
+
1
],
hh
[(
ldh
*
4
)
+
1
]);
__SSE_DATATYPE
h_5_3
=
_SSE_SET
(
hh
[(
ldh
*
4
)
+
2
],
hh
[(
ldh
*
4
)
+
2
]);
__SSE_DATATYPE
h_5_2
=
_SSE_SET
(
hh
[(
ldh
*
4
)
+
3
],
hh
[(
ldh
*
4
)
+
3
]);
__SSE_DATATYPE
h_5_1
=
_SSE_SET
(
hh
[(
ldh
*
4
)
+
4
],
hh
[(
ldh
*
4
)
+
4
]);
#endif
register
__SSE_DATATYPE
v1
=
_SSE_ADD
(
a5_1
,
_SSE_MUL
(
a4_1
,
h_5_4
));
// v1 = a_5_1 + a_4_1 * h_5_4
v1
=
_SSE_ADD
(
v1
,
_SSE_MUL
(
a3_1
,
h_5_3
));
// v1 = v1 + a_3_1 * h_5_3
v1
=
_SSE_ADD
(
v1
,
_SSE_MUL
(
a2_1
,
h_5_2
));
// v1 = v1 + a_2_1 * h_5_2
v1
=
_SSE_ADD
(
v1
,
_SSE_MUL
(
a1_1
,
h_5_1
));
// v1 = v1 + a_1_1 * h_5_1
#ifdef HAVE_SSE_INTRINSICS
__SSE_DATATYPE
h_4_3
=
_SSE_SET1
(
hh
[(
ldh
*
3
)
+
1
]);
// h_4_3 = hh(2,4)
__SSE_DATATYPE
h_4_2
=
_SSE_SET1
(
hh
[(
ldh
*
3
)
+
2
]);
// h_4_2 = hh(3,4)
__SSE_DATATYPE
h_4_1
=
_SSE_SET1
(
hh
[(
ldh
*
3
)
+
3
]);
// h_4_1 = hh(4,4)
#endif
#ifdef HAVE_SPARC64_SSE
__SSE_DATATYPE
h_4_3
=
_SSE_SET
(
hh
[(
ldh
*
3
)
+
1
],
hh
[(
ldh
*
3
)
+
1
]);
__SSE_DATATYPE
h_4_2
=
_SSE_SET
(
hh
[(
ldh
*
3
)
+
2
],
hh
[(
ldh
*
3
)
+
2
]);
__SSE_DATATYPE
h_4_1
=
_SSE_SET
(
hh
[(
ldh
*
3
)
+
3
],
hh
[(
ldh
*
3
)
+
3
]);
#endif
register
__SSE_DATATYPE
w1
=
_SSE_ADD
(
a4_1
,
_SSE_MUL
(
a3_1
,
h_4_3
));
// w1 = a_4_1 + a_3_1 * h_4_3
w1
=
_SSE_ADD
(
w1
,
_SSE_MUL
(
a2_1
,
h_4_2
));
// w1 = w1 + a_2_1 * h_4_2
w1
=
_SSE_ADD
(
w1
,
_SSE_MUL
(
a1_1
,
h_4_1
));
// w1 = w1 + a_1_1 * h_4_1
#ifdef HAVE_SSE_INTRINSICS
__SSE_DATATYPE
h_2_1
=
_SSE_SET1
(
hh
[
ldh
+
1
]);
// h_2_1 = hh(2,2)
__SSE_DATATYPE
h_3_2
=
_SSE_SET1
(
hh
[(
ldh
*
2
)
+
1
]);
// h_3_2 = hh(2,3)
__SSE_DATATYPE
h_3_1
=
_SSE_SET1
(
hh
[(
ldh
*
2
)
+
2
]);
// h_3_1 = hh(3,3)
#endif
#ifdef HAVE_SPARC64_SSE
__SSE_DATATYPE
h_2_1
=
_SSE_SET
(
hh
[
ldh
+
1
],
hh
[
ldh
+
1
]);
__SSE_DATATYPE
h_3_2
=
_SSE_SET
(
hh
[(
ldh
*
2
)
+
1
],
hh
[(
ldh
*
2
)
+
1
]);
__SSE_DATATYPE
h_3_1
=
_SSE_SET
(
hh
[(
ldh
*
2
)
+
2
],
hh
[(
ldh
*
2
)
+
2
]);
#endif
register
__SSE_DATATYPE
z1
=
_SSE_ADD
(
a3_1
,
_SSE_MUL
(
a2_1
,
h_3_2
));
// z1 = a_3_1 + a_2_1 * h_3_2
z1
=
_SSE_ADD
(
z1
,
_SSE_MUL
(
a1_1
,
h_3_1
));
// z1 = z1 + a_1_1 * h_3_1
register
__SSE_DATATYPE
y1
=
_SSE_ADD
(
a2_1
,
_SSE_MUL
(
a1_1
,
h_2_1
));
// y1 = a_2_1 + a_1_1 * h_2_1
register
__SSE_DATATYPE
x1
=
a1_1
;
// x1 = a_1_1