Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
On Thursday, 7th July from 1 to 3 pm there will be a maintenance with a short downtime of GitLab.
Open sidebar
elpa
elpa
Commits
8df8f226
Commit
8df8f226
authored
Jun 28, 2019
by
Andreas Marek
Browse files
Unify VSX kernels
parent
d5798c37
Changes
3
Hide whitespace changes
Inline
Side-by-side
src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c
View file @
8df8f226
...
...
@@ -906,6 +906,34 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
!f>#endif
*/
/*
!f>#ifdef HAVE_VSX_SSE
!f> interface
!f> subroutine quad_hh_trafo_real_VSX_4hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="quad_hh_trafo_real_VSX_4hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_VSX_SSE
!f> interface
!f> subroutine quad_hh_trafo_real_VSX_4hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="quad_hh_trafo_real_VSX_4hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
...
...
@@ -1043,6 +1071,32 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_VSX_SSE
!f> interface
!f> subroutine hexa_hh_trafo_real_VSX_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_VSX_6hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_VSX_SSE
!f> interface
!f> subroutine hexa_hh_trafo_real_VSX_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_VSX_6hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
...
...
@@ -1836,7 +1890,6 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
__SIMD_DATATYPE sign = vec_spalts(-1.0);
#endif
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_REAL
__SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi64x(0x8000000000000000);
...
...
src/elpa2/kernels/real_vsx_4hv_template.c
deleted
100644 → 0
View file @
d5798c37
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#ifdef HAVE_VSX_SSE
#include <altivec.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#ifdef DOUBLE_PRECISION_REAL
#define offset 2
#define __SSE_DATATYPE __vector double
#define _SSE_LOAD (__vector double) vec_ld
#define _SSE_ADD vec_add
#define _SSE_SUB vec_sub
#define _SSE_MUL vec_mul
#define _SSE_STORE vec_st
#endif
#ifdef SINGLE_PRECISION_REAL
#define offset 4
#define __SSE_DATATYPE __vector float
#define _SSE_LOAD (__vector float) vec_ld
#define _SSE_ADD vec_add
#define _SSE_SUB vec_sub
#define _SSE_MUL vec_mul
#define _SSE_STORE vec_st
#endif
#define __forceinline __attribute__((always_inline)) static
//Forward declaration
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_2_VSX_4hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s_1_2
,
double
s_1_3
,
double
s_2_3
,
double
s_1_4
,
double
s_2_4
,
double
s_3_4
);
__forceinline
void
hh_trafo_kernel_4_VSX_4hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s_1_2
,
double
s_1_3
,
double
s_2_3
,
double
s_1_4
,
double
s_2_4
,
double
s_3_4
);
__forceinline
void
hh_trafo_kernel_6_VSX_4hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s_1_2
,
double
s_1_3
,
double
s_2_3
,
double
s_1_4
,
double
s_2_4
,
double
s_3_4
);
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_4_VSX_4hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s_1_2
,
float
s_1_3
,
float
s_2_3
,
float
s_1_4
,
float
s_2_4
,
float
s_3_4
);
__forceinline
void
hh_trafo_kernel_8_VSX_4hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s_1_2
,
float
s_1_3
,
float
s_2_3
,
float
s_1_4
,
float
s_2_4
,
float
s_3_4
);
__forceinline
void
hh_trafo_kernel_12_VSX_4hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s_1_2
,
float
s_1_3
,
float
s_2_3
,
float
s_1_4
,
float
s_2_4
,
float
s_3_4
);
#endif
#ifdef DOUBLE_PRECISION_REAL
void
quad_hh_trafo_real_vsx_4hv_double
(
double
*
q
,
double
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
);
#endif
#ifdef SINGLE_PRECISION_REAL
void
quad_hh_trafo_real_vsx_4hv_single
(
float
*
q
,
float
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
);
#endif
/*
!f>#ifdef HAVE_VSX_SSE
!f> interface
!f> subroutine quad_hh_trafo_real_vsx_4hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="quad_hh_trafo_real_vsx_4hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_VSX_SSE
!f> interface
!f> subroutine quad_hh_trafo_real_vsx_4hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="quad_hh_trafo_real_vsx_4hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#ifdef DOUBLE_PRECISION_REAL
void
quad_hh_trafo_real_vsx_4hv_double
(
double
*
q
,
double
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
)
#endif
#ifdef SINGLE_PRECISION_REAL
void
quad_hh_trafo_real_vsx_4hv_single
(
float
*
q
,
float
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
)
#endif
{
int
i
;
int
nb
=
*
pnb
;
int
nq
=
*
pldq
;
int
ldq
=
*
pldq
;
int
ldh
=
*
pldh
;
int
worked_on
;
worked_on
=
0
;
// calculating scalar products to compute
// 4 householder vectors simultaneously
#ifdef DOUBLE_PRECISION_REAL
double
s_1_2
=
hh
[(
ldh
)
+
1
];
double
s_1_3
=
hh
[(
ldh
*
2
)
+
2
];
double
s_2_3
=
hh
[(
ldh
*
2
)
+
1
];
double
s_1_4
=
hh
[(
ldh
*
3
)
+
3
];
double
s_2_4
=
hh
[(
ldh
*
3
)
+
2
];
double
s_3_4
=
hh
[(
ldh
*
3
)
+
1
];
#endif
#ifdef SINGLE_PRECISION_REAL
float
s_1_2
=
hh
[(
ldh
)
+
1
];
float
s_1_3
=
hh
[(
ldh
*
2
)
+
2
];
float
s_2_3
=
hh
[(
ldh
*
2
)
+
1
];
float
s_1_4
=
hh
[(
ldh
*
3
)
+
3
];
float
s_2_4
=
hh
[(
ldh
*
3
)
+
2
];
float
s_3_4
=
hh
[(
ldh
*
3
)
+
1
];
#endif
// calculate scalar product of first and fourth householder Vector
// loop counter = 2
s_1_2
+=
hh
[
2
-
1
]
*
hh
[(
2
+
ldh
)];
s_2_3
+=
hh
[(
ldh
)
+
2
-
1
]
*
hh
[
2
+
(
ldh
*
2
)];
s_3_4
+=
hh
[(
ldh
*
2
)
+
2
-
1
]
*
hh
[
2
+
(
ldh
*
3
)];
// loop counter = 3
s_1_2
+=
hh
[
3
-
1
]
*
hh
[(
3
+
ldh
)];
s_2_3
+=
hh
[(
ldh
)
+
3
-
1
]
*
hh
[
3
+
(
ldh
*
2
)];
s_3_4
+=
hh
[(
ldh
*
2
)
+
3
-
1
]
*
hh
[
3
+
(
ldh
*
3
)];
s_1_3
+=
hh
[
3
-
2
]
*
hh
[
3
+
(
ldh
*
2
)];
s_2_4
+=
hh
[(
ldh
*
1
)
+
3
-
2
]
*
hh
[
3
+
(
ldh
*
3
)];
#pragma ivdep
for
(
i
=
4
;
i
<
nb
;
i
++
)
{
s_1_2
+=
hh
[
i
-
1
]
*
hh
[(
i
+
ldh
)];
s_2_3
+=
hh
[(
ldh
)
+
i
-
1
]
*
hh
[
i
+
(
ldh
*
2
)];
s_3_4
+=
hh
[(
ldh
*
2
)
+
i
-
1
]
*
hh
[
i
+
(
ldh
*
3
)];
s_1_3
+=
hh
[
i
-
2
]
*
hh
[
i
+
(
ldh
*
2
)];
s_2_4
+=
hh
[(
ldh
*
1
)
+
i
-
2
]
*
hh
[
i
+
(
ldh
*
3
)];
s_1_4
+=
hh
[
i
-
3
]
*
hh
[
i
+
(
ldh
*
3
)];
}
// Production level kernel calls with padding
#ifdef DOUBLE_PRECISION_REAL
for
(
i
=
0
;
i
<
nq
-
4
;
i
+=
6
)
{
hh_trafo_kernel_6_VSX_4hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s_1_2
,
s_1_3
,
s_2_3
,
s_1_4
,
s_2_4
,
s_3_4
);
worked_on
+=
6
;
}
#endif
#ifdef SINGLE_PRECISION_REAL
for
(
i
=
0
;
i
<
nq
-
8
;
i
+=
12
)
{
hh_trafo_kernel_12_VSX_4hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s_1_2
,
s_1_3
,
s_2_3
,
s_1_4
,
s_2_4
,
s_3_4
);
worked_on
+=
12
;
}
#endif
if
(
nq
==
i
)
{
return
;
}
#ifdef DOUBLE_PRECISION_REAL
if
(
nq
-
i
==
4
)
{
hh_trafo_kernel_4_VSX_4hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s_1_2
,
s_1_3
,
s_2_3
,
s_1_4
,
s_2_4
,
s_3_4
);
worked_on
+=
4
;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if
(
nq
-
i
==
8
)
{
hh_trafo_kernel_8_VSX_4hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s_1_2
,
s_1_3
,
s_2_3
,
s_1_4
,
s_2_4
,
s_3_4
);
worked_on
+=
8
;
}
#endif
#ifdef DOUBLE_PRECISION_REAL
if
(
nq
-
i
==
2
)
{
hh_trafo_kernel_2_VSX_4hv_double
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s_1_2
,
s_1_3
,
s_2_3
,
s_1_4
,
s_2_4
,
s_3_4
);
worked_on
+=
2
;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if
(
nq
-
i
==
4
)
{
hh_trafo_kernel_4_VSX_4hv_single
(
&
q
[
i
],
hh
,
nb
,
ldq
,
ldh
,
s_1_2
,
s_1_3
,
s_2_3
,
s_1_4
,
s_2_4
,
s_3_4
);
worked_on
+=
4
;
}
#endif
#ifdef WITH_DEBUG
if
(
worked_on
!=
nq
)
{
printf
(
"Error in real VSX BLOCK4 kernel
\n
"
);
abort
();
}
#endif
}
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 6 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 12 rows of Q simultaneously, a
#endif
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
#ifdef DOUBLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_6_VSX_4hv_double
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s_1_2
,
double
s_1_3
,
double
s_2_3
,
double
s_1_4
,
double
s_2_4
,
double
s_3_4
)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline
void
hh_trafo_kernel_12_VSX_4hv_single
(
float
*
q
,
float
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
float
s_1_2
,
float
s_1_3
,
float
s_2_3
,
float
s_1_4
,
float
s_2_4
,
float
s_3_4
)
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [6 x nb+3] * hh
// hh contains four householder vectors
/////////////////////////////////////////////////////
int
i
;
__SSE_DATATYPE
a1_1
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[
ldq
*
3
]);
__SSE_DATATYPE
a2_1
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[
ldq
*
2
]);
__SSE_DATATYPE
a3_1
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[
ldq
]);
__SSE_DATATYPE
a4_1
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[
0
]);
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE
h_2_1
=
vec_splats
(
hh
[
ldh
+
1
]);
__SSE_DATATYPE
h_3_2
=
vec_splats
(
hh
[(
ldh
*
2
)
+
1
]);
__SSE_DATATYPE
h_3_1
=
vec_splats
(
hh
[(
ldh
*
2
)
+
2
]);
__SSE_DATATYPE
h_4_3
=
vec_splats
(
hh
[(
ldh
*
3
)
+
1
]);
__SSE_DATATYPE
h_4_2
=
vec_splats
(
hh
[(
ldh
*
3
)
+
2
]);
__SSE_DATATYPE
h_4_1
=
vec_splats
(
hh
[(
ldh
*
3
)
+
3
]);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE
h_2_1
=
vec_splats
(
hh
[
ldh
+
1
]
);
// h_2_1 contains four times hh[ldh+1]
__SSE_DATATYPE
h_3_2
=
vec_splats
(
hh
[(
ldh
*
2
)
+
1
]);
__SSE_DATATYPE
h_3_1
=
vec_splats
(
hh
[(
ldh
*
2
)
+
2
]);
__SSE_DATATYPE
h_4_3
=
vec_splats
(
hh
[(
ldh
*
3
)
+
1
]);
__SSE_DATATYPE
h_4_2
=
vec_splats
(
hh
[(
ldh
*
3
)
+
2
]);
__SSE_DATATYPE
h_4_1
=
vec_splats
(
hh
[(
ldh
*
3
)
+
3
]);
#endif
register
__SSE_DATATYPE
w1
=
_SSE_ADD
(
a4_1
,
_SSE_MUL
(
a3_1
,
h_4_3
));
w1
=
_SSE_ADD
(
w1
,
_SSE_MUL
(
a2_1
,
h_4_2
));
w1
=
_SSE_ADD
(
w1
,
_SSE_MUL
(
a1_1
,
h_4_1
));
register
__SSE_DATATYPE
z1
=
_SSE_ADD
(
a3_1
,
_SSE_MUL
(
a2_1
,
h_3_2
));
z1
=
_SSE_ADD
(
z1
,
_SSE_MUL
(
a1_1
,
h_3_1
));
register
__SSE_DATATYPE
y1
=
_SSE_ADD
(
a2_1
,
_SSE_MUL
(
a1_1
,
h_2_1
));
register
__SSE_DATATYPE
x1
=
a1_1
;
__SSE_DATATYPE
a1_2
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[(
ldq
*
3
)
+
offset
]);
__SSE_DATATYPE
a2_2
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[(
ldq
*
2
)
+
offset
]);
__SSE_DATATYPE
a3_2
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[
ldq
+
offset
]);
__SSE_DATATYPE
a4_2
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[
0
+
offset
]);
register
__SSE_DATATYPE
w2
=
_SSE_ADD
(
a4_2
,
_SSE_MUL
(
a3_2
,
h_4_3
));
w2
=
_SSE_ADD
(
w2
,
_SSE_MUL
(
a2_2
,
h_4_2
));
w2
=
_SSE_ADD
(
w2
,
_SSE_MUL
(
a1_2
,
h_4_1
));
register
__SSE_DATATYPE
z2
=
_SSE_ADD
(
a3_2
,
_SSE_MUL
(
a2_2
,
h_3_2
));
z2
=
_SSE_ADD
(
z2
,
_SSE_MUL
(
a1_2
,
h_3_1
));
register
__SSE_DATATYPE
y2
=
_SSE_ADD
(
a2_2
,
_SSE_MUL
(
a1_2
,
h_2_1
));
register
__SSE_DATATYPE
x2
=
a1_2
;
__SSE_DATATYPE
a1_3
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[(
ldq
*
3
)
+
2
*
offset
]);
__SSE_DATATYPE
a2_3
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[(
ldq
*
2
)
+
2
*
offset
]);
__SSE_DATATYPE
a3_3
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[
ldq
+
2
*
offset
]);
__SSE_DATATYPE
a4_3
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[
0
+
2
*
offset
]);
register
__SSE_DATATYPE
w3
=
_SSE_ADD
(
a4_3
,
_SSE_MUL
(
a3_3
,
h_4_3
));
w3
=
_SSE_ADD
(
w3
,
_SSE_MUL
(
a2_3
,
h_4_2
));
w3
=
_SSE_ADD
(
w3
,
_SSE_MUL
(
a1_3
,
h_4_1
));
register
__SSE_DATATYPE
z3
=
_SSE_ADD
(
a3_3
,
_SSE_MUL
(
a2_3
,
h_3_2
));
z3
=
_SSE_ADD
(
z3
,
_SSE_MUL
(
a1_3
,
h_3_1
));
register
__SSE_DATATYPE
y3
=
_SSE_ADD
(
a2_3
,
_SSE_MUL
(
a1_3
,
h_2_1
));
register
__SSE_DATATYPE
x3
=
a1_3
;
__SSE_DATATYPE
q1
;
__SSE_DATATYPE
q2
;
__SSE_DATATYPE
q3
;
__SSE_DATATYPE
h1
;
__SSE_DATATYPE
h2
;
__SSE_DATATYPE
h3
;
__SSE_DATATYPE
h4
;
for
(
i
=
4
;
i
<
nb
;
i
++
)
{
#ifdef DOUBLE_PRECISION_REAL
h1
=
vec_splats
(
hh
[
i
-
3
]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1
=
vec_splats
(
hh
[
i
-
3
]);
#endif
q1
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[
i
*
ldq
]);
q2
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[(
i
*
ldq
)
+
offset
]);
q3
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[(
i
*
ldq
)
+
2
*
offset
]);
x1
=
_SSE_ADD
(
x1
,
_SSE_MUL
(
q1
,
h1
));
x2
=
_SSE_ADD
(
x2
,
_SSE_MUL
(
q2
,
h1
));
x3
=
_SSE_ADD
(
x3
,
_SSE_MUL
(
q3
,
h1
));
#ifdef DOUBLE_PRECISION_REAL
h2
=
vec_splats
(
hh
[
ldh
+
i
-
2
]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2
=
vec_splats
(
hh
[
ldh
+
i
-
2
]);
#endif
y1
=
_SSE_ADD
(
y1
,
_SSE_MUL
(
q1
,
h2
));
y2
=
_SSE_ADD
(
y2
,
_SSE_MUL
(
q2
,
h2
));
y3
=
_SSE_ADD
(
y3
,
_SSE_MUL
(
q3
,
h2
));
#ifdef DOUBLE_PRECISION_REAL
h3
=
vec_splats
(
hh
[(
ldh
*
2
)
+
i
-
1
]);
#endif
#ifdef SINGLE_PRECISION_REAL
h3
=
vec_splats
(
hh
[(
ldh
*
2
)
+
i
-
1
]);
#endif
z1
=
_SSE_ADD
(
z1
,
_SSE_MUL
(
q1
,
h3
));
z2
=
_SSE_ADD
(
z2
,
_SSE_MUL
(
q2
,
h3
));
z3
=
_SSE_ADD
(
z3
,
_SSE_MUL
(
q3
,
h3
));
#ifdef DOUBLE_PRECISION_REAL
h4
=
vec_splats
(
hh
[(
ldh
*
3
)
+
i
]);
#endif
#ifdef SINGLE_PRECISION_REAL
h4
=
vec_splats
(
hh
[(
ldh
*
3
)
+
i
]);
#endif
w1
=
_SSE_ADD
(
w1
,
_SSE_MUL
(
q1
,
h4
));
w2
=
_SSE_ADD
(
w2
,
_SSE_MUL
(
q2
,
h4
));
w3
=
_SSE_ADD
(
w3
,
_SSE_MUL
(
q3
,
h4
));
}
#ifdef DOUBLE_PRECISION_REAL
h1
=
vec_splats
(
hh
[
nb
-
3
]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1
=
vec_splats
(
hh
[
nb
-
3
]);
#endif
q1
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[
nb
*
ldq
]);
q2
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[(
nb
*
ldq
)
+
offset
]);
q3
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[(
nb
*
ldq
)
+
2
*
offset
]);
x1
=
_SSE_ADD
(
x1
,
_SSE_MUL
(
q1
,
h1
));
x2
=
_SSE_ADD
(
x2
,
_SSE_MUL
(
q2
,
h1
));
x3
=
_SSE_ADD
(
x3
,
_SSE_MUL
(
q3
,
h1
));
#ifdef DOUBLE_PRECISION_REAL
h2
=
vec_splats
(
hh
[
ldh
+
nb
-
2
]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2
=
vec_splats
(
hh
[
ldh
+
nb
-
2
]);
#endif
y1
=
_SSE_ADD
(
y1
,
_SSE_MUL
(
q1
,
h2
));
y2
=
_SSE_ADD
(
y2
,
_SSE_MUL
(
q2
,
h2
));
y3
=
_SSE_ADD
(
y3
,
_SSE_MUL
(
q3
,
h2
));
#ifdef DOUBLE_PRECISION_REAL
h3
=
vec_splats
(
hh
[(
ldh
*
2
)
+
nb
-
1
]);
#endif
#ifdef SINGLE_PRECISION_REAL
h3
=
vec_splats
(
hh
[(
ldh
*
2
)
+
nb
-
1
]);
#endif
z1
=
_SSE_ADD
(
z1
,
_SSE_MUL
(
q1
,
h3
));
z2
=
_SSE_ADD
(
z2
,
_SSE_MUL
(
q2
,
h3
));
z3
=
_SSE_ADD
(
z3
,
_SSE_MUL
(
q3
,
h3
));
#ifdef DOUBLE_PRECISION_REAL
h1
=
vec_splats
(
hh
[
nb
-
2
]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1
=
vec_splats
(
hh
[
nb
-
2
]);
#endif
q1
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[(
nb
+
1
)
*
ldq
]);
q2
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[((
nb
+
1
)
*
ldq
)
+
offset
]);
q3
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[((
nb
+
1
)
*
ldq
)
+
2
*
offset
]);
x1
=
_SSE_ADD
(
x1
,
_SSE_MUL
(
q1
,
h1
));
x2
=
_SSE_ADD
(
x2
,
_SSE_MUL
(
q2
,
h1
));
x3
=
_SSE_ADD
(
x3
,
_SSE_MUL
(
q3
,
h1
));
#ifdef DOUBLE_PRECISION_REAL
h2
=
vec_splats
(
hh
[(
ldh
*
1
)
+
nb
-
1
]);
#endif
#ifdef SINGLE_PRECISION_REAL
h2
=
vec_splats
(
hh
[
ldh
+
nb
-
1
]);
#endif
y1
=
_SSE_ADD
(
y1
,
_SSE_MUL
(
q1
,
h2
));
y2
=
_SSE_ADD
(
y2
,
_SSE_MUL
(
q2
,
h2
));
y3
=
_SSE_ADD
(
y3
,
_SSE_MUL
(
q3
,
h2
));
#ifdef DOUBLE_PRECISION_REAL
h1
=
vec_splats
(
hh
[
nb
-
1
]);
#endif
#ifdef SINGLE_PRECISION_REAL
h1
=
vec_splats
(
hh
[
nb
-
1
]);
#endif
q1
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[(
nb
+
2
)
*
ldq
]);
q2
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[((
nb
+
2
)
*
ldq
)
+
offset
]);
q3
=
_SSE_LOAD
(
0
,
(
unsigned
long
int
*
)
&
q
[((
nb
+
2
)
*
ldq
)
+
2
*
offset
]);
x1
=
_SSE_ADD
(
x1
,
_SSE_MUL
(
q1
,
h1
));
x2
=
_SSE_ADD
(
x2
,
_SSE_MUL
(
q2
,
h1
));
x3
=
_SSE_ADD
(
x3
,
_SSE_MUL
(
q3
,
h1
));
/////////////////////////////////////////////////////
// Rank-1 update of Q [6 x nb+3]
/////////////////////////////////////////////////////
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE
tau1
=
vec_splats
(
hh
[
0
]);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE
tau1
=
vec_splats
(
hh
[
0
]);
#endif
h1
=
tau1
;
x1
=
_SSE_MUL
(
x1
,
h1
);
x2
=
_SSE_MUL
(
x2
,
h1
);
x3
=
_SSE_MUL
(
x3
,
h1
);
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE
tau2
=
vec_splats
(
hh
[
ldh
]);
__SSE_DATATYPE
vs_1_2
=
vec_splats
(
s_1_2
);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE
tau2
=
vec_splats
(
hh
[
ldh
]);
__SSE_DATATYPE
vs_1_2
=
vec_splats
(
s_1_2
);
#endif
h1
=
tau2
;
h2
=
_SSE_MUL
(
h1
,
vs_1_2
);
y1
=
_SSE_SUB
(
_SSE_MUL
(
y1
,
h1
),
_SSE_MUL
(
x1
,
h2
));
y2
=
_SSE_SUB
(
_SSE_MUL
(
y2
,
h1
),
_SSE_MUL
(
x2
,
h2
));
y3
=
_SSE_SUB
(
_SSE_MUL
(
y3
,
h1
),
_SSE_MUL
(
x3
,
h2
));
#ifdef DOUBLE_PRECISION_REAL
__SSE_DATATYPE
tau3
=
vec_splats
(
hh
[
ldh
*
2
]);
__SSE_DATATYPE
vs_1_3
=
vec_splats
(
s_1_3
);
__SSE_DATATYPE
vs_2_3
=
vec_splats
(
s_2_3
);
#endif
#ifdef SINGLE_PRECISION_REAL
__SSE_DATATYPE
tau3
=
vec_splats
(
hh
[
ldh
*
2
]);
__SSE_DATATYPE
vs_1_3
=
vec_splats
(
s_1_3
);
__SSE_DATATYPE
vs_2_3
=
vec_splats
(
s_2_3
);
#endif