elpa / Commits

Commit 777d66f8, authored Jun 03, 2019 by Andreas Marek
Start to unify complex SSE kernels
Parent: 4817663a
Changes: 6, Pipelines: 1
Showing 6 changed files with 2180 additions and 794 deletions
Doxyfile.in: +1 -1
Makefile.am: +1 -1
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c: +2168 -0
src/elpa2/kernels/complex_sse_1hv_double_precision.c: +5 -1
src/elpa2/kernels/complex_sse_1hv_single_precision.c: +5 -1
src/elpa2/kernels/complex_sse_1hv_template.c: +0 -790
Doxyfile.in
@@ -926,7 +926,7 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
   @top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c \
   @top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_single_precision.c \
   @top_srcdir@/src/elpa2/kernels/mod_single_hh_trafo_real.F90 \
-  @top_srcdir@/src/elpa2/kernels/complex_sse_1hv_template.c \
+  @top_srcdir@/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
   @top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c \
   @top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_double_precision.c \
   @top_srcdir@/src/elpa2/kernels/complex_sse_2hv_template.c \
Makefile.am
@@ -795,7 +795,7 @@ EXTRA_DIST = \
   src/elpa2/kernels/complex_avx-avx2_2hv_template.c \
   src/elpa2/kernels/complex_avx512_1hv_template.c \
   src/elpa2/kernels/complex_avx512_2hv_template.c \
-  src/elpa2/kernels/complex_sse_1hv_template.c \
+  src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
   src/elpa2/kernels/complex_sse_2hv_template.c \
   src/elpa2/kernels/complex_template.F90 \
   src/elpa2/kernels/real_vsx_4hv_template.c \
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c (new file, mode 100644)
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation, which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of version 3 of the GNU Lesser General Public
// License as published by the Free Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
//
#include "config-f90.h"
#define CONCAT_8ARGS(a, b, c, d, e, f, g, h) CONCAT2_8ARGS(a, b, c, d, e, f, g, h)
#define CONCAT2_8ARGS(a, b, c, d, e, f, g, h) a ## b ## c ## d ## e ## f ## g ## h
#define CONCAT_7ARGS(a, b, c, d, e, f, g) CONCAT2_7ARGS(a, b, c, d, e, f, g)
#define CONCAT2_7ARGS(a, b, c, d, e, f, g) a ## b ## c ## d ## e ## f ## g
#define CONCAT_6ARGS(a, b, c, d, e, f) CONCAT2_6ARGS(a, b, c, d, e, f)
#define CONCAT2_6ARGS(a, b, c, d, e, f) a ## b ## c ## d ## e ## f
#define CONCAT_5ARGS(a, b, c, d, e) CONCAT2_5ARGS(a, b, c, d, e)
#define CONCAT2_5ARGS(a, b, c, d, e) a ## b ## c ## d ## e
#define CONCAT_4ARGS(a, b, c, d) CONCAT2_4ARGS(a, b, c, d)
#define CONCAT2_4ARGS(a, b, c, d) a ## b ## c ## d
#define CONCAT_3ARGS(a, b, c) CONCAT2_3ARGS(a, b, c)
#define CONCAT2_3ARGS(a, b, c) a ## b ## c
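/* Illustrative expansion (not part of the build): with ROW_LENGTH=6,
   SIMD_SET=SSE, BLOCK=1 and WORD_LENGTH=double, the invocation
       CONCAT_8ARGS(hh_trafo_complex_kernel_,6,_,SSE,_,1,hv_,double)
   pastes its arguments into the single identifier
       hh_trafo_complex_kernel_6_SSE_1hv_double
   The two-level CONCAT -> CONCAT2 indirection is the standard preprocessor
   idiom that forces arguments such as ROW_LENGTH to be macro-expanded
   before the ## concatenation takes place. */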
// define instruction set numbers
#define SSE_128 128
#define NEON_ARCH64_128 1285
#if VEC_SET == SSE_128 || VEC_SET == 256 || VEC_SET == 512
#include <x86intrin.h>
#ifdef BLOCK2
#include <pmmintrin.h>
#endif
#endif
#include <complex.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef BLOCK2
#define PREFIX double
#define BLOCK 2
#endif
#ifdef BLOCK1
#define PREFIX single
#define BLOCK 1
#endif
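/* Note: PREFIX feeds the Fortran-visible function name built below
   (double_hh_trafo_... for the two-vector BLOCK2 case,
   single_hh_trafo_... for the one-vector BLOCK1 case); numeric
   precision is carried separately by WORD_LENGTH. */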
#if VEC_SET == SSE_128
#define SIMD_SET SSE
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 2
#define __SIMD_DATATYPE __m128d
#define _SIMD_LOAD _mm_load_pd
#define _SIMD_LOADU _mm_loadu_pd
#define _SIMD_STORE _mm_store_pd
#define _SIMD_MUL _mm_mul_pd
#define _SIMD_ADD _mm_add_pd
#define _SIMD_XOR _mm_xor_pd
#define _SIMD_MADDSUB _mm_maddsub_pd
#define _SIMD_ADDSUB _mm_addsub_pd
#define _SIMD_SHUFFLE _mm_shuffle_pd
#define _SHUFFLE _MM_SHUFFLE2(0,1)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 4
#define __SIMD_DATATYPE __m128
#define _SIMD_LOAD _mm_load_ps
#define _SIMD_LOADU _mm_loadu_ps
#define _SIMD_STORE _mm_store_ps
#define _SIMD_MUL _mm_mul_ps
#define _SIMD_ADD _mm_add_ps
#define _SIMD_XOR _mm_xor_ps
#define _SIMD_MADDSUB _mm_maddsub_ps
#define _SIMD_ADDSUB _mm_addsub_ps
#define _SIMD_SHUFFLE _mm_shuffle_ps
#define _SHUFFLE 0xb1
#endif
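/* The _SIMD_* aliases map one generic name onto the SSE intrinsic of the
   selected precision, so the kernel bodies below can be written once per
   vector width.  "offset" is the number of real elements per 128-bit
   register: a __m128d holds one double-complex value (2 doubles), a __m128
   holds two float-complex values (4 floats).  _SHUFFLE is chosen so that
   _SIMD_SHUFFLE(x, x, _SHUFFLE) swaps the real and imaginary component
   within each complex pair (_MM_SHUFFLE2(0,1) for pd, 0xb1 for ps). */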
#define __forceinline __attribute__((always_inline))
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
#define WORD_LENGTH double
#define DATA_TYPE double complex
#define DATA_TYPE_PTR double complex*
#define DATA_TYPE_REAL double
#define DATA_TYPE_REAL_PTR double*
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define WORD_LENGTH single
#define DATA_TYPE float complex
#define DATA_TYPE_PTR float complex*
#define DATA_TYPE_REAL float
#define DATA_TYPE_REAL_PTR float*
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 6
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 12
#endif
#endif /* VEC_SET == SSE_128 */
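/* ROW_LENGTH is the number of complex values processed per kernel call:
   six 128-bit registers hold 6 double-complex or 12 float-complex values.
   The redefinitions further down (4/8, then 2/4) declare the narrower
   remainder kernels. */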
// Forward declaration
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1
        );
#endif
#ifdef BLOCK2
        , int ldh, DATA_TYPE s);
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 8
#endif
#endif /* VEC_SET == SSE_128 */
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1
        );
#endif
#ifdef BLOCK2
        , int ldh, DATA_TYPE s);
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 4
#endif
#endif /* VEC_SET == SSE_128 */
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1
        );
#endif
#ifdef BLOCK2
        , int ldh, DATA_TYPE s);
#endif
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine single_hh_trafo_complex_SSE_1hv_double(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_SSE_1hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine single_hh_trafo_complex_SSE_1hv_single(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_SSE_1hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
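/* The "!f>" blocks above follow the convention used throughout the ELPA
   kernel sources: Fortran interface declarations embedded in C comments,
   which the build system extracts to generate the Fortran bindings.  Only
   the SSE 1hv variants are declared so far, matching the scope of this
   first unification step. */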
void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq
#ifdef BLOCK1
        )
#endif
#ifdef BLOCK2
        , int* pldh)
#endif
{
    int i, worked_on;
    int nb = *pnb;
    int nq = *pldq;
    int ldq = *pldq;
#ifdef BLOCK2
    int ldh = *pldh;

    DATA_TYPE s = conj(hh[(ldh)+1])*1.0;

    for (i = 2; i < nb; i++)
    {
        s += hh[i-1] * conj(hh[(i+ldh)]);
    }
#endif
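    /* s is (up to the implicit leading 1 of the first Householder vector)
       the inner product of the two Householder vectors,
       s = sum_i hh(i)*conj(hh(ldh+i)); the BLOCK2 kernels need it to
       couple the two rank-1 updates. */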
    worked_on = 0;
#ifdef BLOCK1
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 6
#define STEP_SIZE 6
#define UPPER_BOUND 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#define STEP_SIZE 12
#define UPPER_BOUND 8
#endif
    for (i = 0; i < nq - UPPER_BOUND; i += STEP_SIZE)
    {
        CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(&q[i], hh, nb, ldq);
        worked_on += ROW_LENGTH;
    }

    if (nq == i)
    {
        return;
    }
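    /* Dispatch strategy: the widest kernel above covers the bulk of the
       nq rows in steps of STEP_SIZE; the narrower variants below handle a
       remainder of exactly ROW_LENGTH rows, so no scalar tail loop is
       needed. */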
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
    if (nq - i == ROW_LENGTH)
    {
        CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(&q[i], hh, nb, ldq);
        worked_on += ROW_LENGTH;
    }
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
    if (nq - i == ROW_LENGTH)
    {
        CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(&q[i], hh, nb, ldq);
        worked_on += ROW_LENGTH;
    }
#endif /* BLOCK1 */
#ifdef BLOCK2
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#define STEP_SIZE 4
#define UPPER_BOUND 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#define STEP_SIZE 8
#define UPPER_BOUND 4
#endif
    for (i = 0; i < nq - UPPER_BOUND; i += STEP_SIZE)
    {
        CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(&q[i], hh, nb, ldq, ldh, s);
        worked_on += ROW_LENGTH;
    }

    if (nq == i)
    {
        return;
    }
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 2
#define STEP_SIZE 2
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#define STEP_SIZE 4
#endif
    if (nq - i == ROW_LENGTH)
    {
        CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(&q[i], hh, nb, ldq, ldh, s);
        worked_on += ROW_LENGTH;
    }
#endif /* BLOCK2 */
#ifdef WITH_DEBUG
    if (worked_on != nq)
    {
        printf("Error in complex SIMD_SET BLOCK BLOCK kernel %d %d\n", worked_on, nq);
        abort();
    }
#endif
}
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 6
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#endif
static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
#ifdef BLOCK1
        )
#endif
#ifdef BLOCK2
        , int ldh, DATA_TYPE s)
#endif
{
    DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
    DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
#ifdef BLOCK2
    DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
#endif
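    /* The casts above view the complex arrays as flat arrays of their
       real components, which is why all indexing below carries an
       explicit factor of 2 (one complex value = two reals). */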
    __SIMD_DATATYPE x1, x2, x3, x4, x5, x6;
    __SIMD_DATATYPE q1, q2, q3, q4, q5, q6;
#ifdef BLOCK2
    __SIMD_DATATYPE y1, y2, y3, y4, y5, y6;
    __SIMD_DATATYPE h2_real, h2_imag;
#endif
    __SIMD_DATATYPE h1_real, h1_imag;
    __SIMD_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    int i = 0;
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
    __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif
#endif /* VEC_SET == SSE_128 */
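    /* XOR with this mask flips the IEEE sign bit of every vector element;
       it is used below to negate the imaginary part of a broadcast
       Householder coefficient (i.e. conjugate it) when no fused
       multiply-add variant is available. */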
#ifdef BLOCK2
    x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
    x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]);
    x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]);
    x4 = _SIMD_LOAD(&q_dbl[(2*ldq)+3*offset]);
    x5 = _SIMD_LOAD(&q_dbl[(2*ldq)+4*offset]);
    x6 = _SIMD_LOAD(&q_dbl[(2*ldq)+5*offset]);
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
    h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
    h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
    h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd((double *)(&hh_dbl[(ldh+1)*2]))));
    h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd((double *)(&hh_dbl[((ldh+1)*2)+1]))));
#endif
#endif /* VEC_SET == SSE_128 */
#ifndef __ELPA_USE_FMA__
    // conjugate
    h2_imag = _SIMD_XOR(h2_imag, sign);
#endif
    y1 = _SIMD_LOAD(&q_dbl[0]);
    y2 = _SIMD_LOAD(&q_dbl[offset]);
    y3 = _SIMD_LOAD(&q_dbl[2*offset]);
    y4 = _SIMD_LOAD(&q_dbl[3*offset]);
    y5 = _SIMD_LOAD(&q_dbl[4*offset]);
    y6 = _SIMD_LOAD(&q_dbl[5*offset]);
    tmp1 = _SIMD_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
    y1 = _SIMD_ADD(y1, _mm_msubadd_pd(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    y1 = _SIMD_ADD(y1, _SIMD_ADDSUB(_SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
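    /* The block above is a packed complex multiply-accumulate,
       y1 += conj(h2) * x1, on interleaved (re,im) pairs: tmp1 holds
       h2_imag*x1, the shuffle swaps re/im within each pair, and
       addsub/msubadd fold it into h2_real*x1.  The non-FMA branch relies
       on h2_imag having been sign-flipped by the XOR above.  (Note the
       FMA branch calls _mm_msubadd_pd directly rather than a
       precision-neutral _SIMD_ alias.)  The same block repeats for
       x2..x6 / y2..y6 below. */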
    tmp2 = _SIMD_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
    y2 = _SIMD_ADD(y2, _mm_msubadd_pd(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    y2 = _SIMD_ADD(y2, _SIMD_ADDSUB(_SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
    tmp3 = _SIMD_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
    y3 = _SIMD_ADD(y3, _mm_msubadd_pd(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    y3 = _SIMD_ADD(y3, _SIMD_ADDSUB(_SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
    tmp4 = _SIMD_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
    y4 = _SIMD_ADD(y4, _mm_msubadd_pd(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    y4 = _SIMD_ADD(y4, _SIMD_ADDSUB(_SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
    tmp5 = _SIMD_MUL(h2_imag, x5);
#ifdef __ELPA_USE_FMA__
    y5 = _SIMD_ADD(y5, _mm_msubadd_pd(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
    y5 = _SIMD_ADD(y5, _SIMD_ADDSUB(_SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
    tmp6 = _SIMD_MUL(h2_imag, x6);
#ifdef __ELPA_USE_FMA__
    y6 = _SIMD_ADD(y6, _mm_msubadd_pd(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
    y6 = _SIMD_ADD(y6, _SIMD_ADDSUB(_SIMD_MUL(h2_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
#endif /* BLOCK2 */
#ifdef BLOCK1
    x1 = _SIMD_LOAD(&q_dbl[0]);
    x2 = _SIMD_LOAD(&q_dbl[offset]);
    x3 = _SIMD_LOAD(&q_dbl[2*offset]);
    x4 = _SIMD_LOAD(&q_dbl[3*offset]);
    x5 = _SIMD_LOAD(&q_dbl[4*offset]);
    x6 = _SIMD_LOAD(&q_dbl[5*offset]);
#endif
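    /* BLOCK1 case: the x accumulators are seeded with the leading
       ROW_LENGTH complex values of q; the loop below folds in the
       remaining Householder components. */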
    for (i = BLOCK; i < nb; i++)
    {
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd((double *)(&hh_dbl[(i-BLOCK+1)*2]))));
        h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd((double *)(&hh_dbl[((i-BLOCK+1)*2)+1]))));
#endif
#endif /* VEC_SET == SSE_128 */
#ifndef __ELPA_USE_FMA__
        // conjugate
        h1_imag = _SIMD_XOR(h1_imag, sign);
#endif
        q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
        q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
        q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
        q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
        q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
        q6 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
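        /* Each x register accumulates its slice of row i of q scaled by
           the conjugated Householder coefficient hh(i-BLOCK+1), i.e. it
           builds up the projection of q onto the Householder vector, one
           vector lane at a time. */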
        tmp1 = _SIMD_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
        x1 = _SIMD_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        x1 = _SIMD_ADD(x1, _SIMD_ADDSUB(_SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SIMD_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
        x2 = _SIMD_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
        x2 = _SIMD_ADD(x2, _SIMD_ADDSUB(_SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
        tmp3 = _SIMD_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
        x3 = _SIMD_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
        x3 = _SIMD_ADD(x3, _SIMD_ADDSUB(_SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
        tmp4 = _SIMD_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
        x4 = _SIMD_ADD(x4, _mm_msubadd_pd(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
        x4 = _SIMD_ADD(x4, _SIMD_ADDSUB(_SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
        tmp5 = _SIMD_MUL(h1_imag, q5);
#ifdef __ELPA_USE_FMA__
        x5 = _SIMD_ADD(x5, _mm_msubadd_pd(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
        x5 = _SIMD_ADD(x5, _SIMD_ADDSUB(_SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
        tmp6 = _SIMD_MUL(h1_imag, q6);
#ifdef __ELPA_USE_FMA__
        x6 = _SIMD_ADD(x6, _mm_msubadd_pd(h1_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
        x6 = _SIMD_ADD(x6, _SIMD_ADDSUB(_SIMD_MUL(h1_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
#ifdef BLOCK2
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
        h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
        h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd((double *)(&hh_dbl[(ldh+i)*2]))));
        h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd((double *)(&hh_dbl[((ldh+i)*2)+1]))));
#endif
#endif /* VEC_SET == SSE_128 */

#ifndef __ELPA_USE_FMA__
        // conjugate
        h2_imag = _SIMD_XOR(h2_imag, sign);
#endif
        tmp1 = _SIMD_MUL(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
        y1 = _SIMD_ADD(y1, _mm_msubadd_pd(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
        y1 = _SIMD_ADD(y1, _SIMD_ADDSUB(_SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
        tmp2 = _SIMD_MUL(h2_imag, q2);
#ifdef __ELPA_USE_FMA__