Sebastian Ohlmann / elpa

Commit b57f0f88
Authored Jun 09, 2019 by Andreas Marek

    Only AVX block1 kernel

Parent 28a03265
Changes 5
Makefile.am
...
...
@@ -794,6 +794,7 @@ EXTRA_DIST = \
src/elpa2/kernels/complex_avx512_1hv_template.c \
src/elpa2/kernels/complex_avx512_2hv_template.c \
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
src/elpa2/kernels/complex_avx-avx2_2hv_template.c \
src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/real_vsx_4hv_template.c \
src/elpa2/kernels/real_vsx_6hv_template.c \
...
...
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c
...
...
@@ -178,10 +178,9 @@
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif
#endif /* HAVE_AVX2 */
#define _SIMD_FMADDSUB _mm256_FMADDSUB_pd
#define _SIMD_FMSUBADD _mm256_FMSUBADD_pd
#endif /* HAVE_AVX2 */
#endif /* DOUBLE_PRECISION_COMPLEX */
...
...
@@ -215,10 +214,9 @@
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
#endif
#endif /* HAVE_AVX2 */
#define _SIMD_FMADDSUB _mm256_FMADDSUB_ps
#define _SIMD_FMSUBADD _mm256_FMSUBADD_ps
#endif /* HAVE_AVX2 */
#endif /* SINGLE_PRECISION_COMPLEX */
...
...
@@ -530,36 +528,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine double_hh_trafo_complex_AVX_AVX2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_AVX_AVX2_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine double_hh_trafo_complex_AVX_AVX2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_AVX_AVX2_2hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq
#ifdef BLOCK1
)
...
...
src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c
...
...
@@ -51,7 +51,7 @@
#define VEC_SET AVX_256
#define BLOCK2 1
#include "../../general/precision_macros.h"
#include "complex_
128bit_256bit_512bit_BLOCK
_template.c"
#include "complex_
avx-avx2_2hv
_template.c"
#undef VEC_SET
#undef BLOCK2
#undef DOUBLE_PRECISION
...
...
src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c
...
...
@@ -51,7 +51,7 @@
#define VEC_SET AVX_256
#define BLOCK2 1
#include "../../general/precision_macros.h"
#include "complex_
128bit_256bit_512bit_BLOCK
_template.c"
#include "complex_
avx-avx2_2hv
_template.c"
#undef VEC_SET
#undef BLOCK2
#undef SINGLE_PRECISION
...
...
src/elpa2/kernels/complex_avx-avx2_2hv_template.c
new file mode 100644
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
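//
// A rough illustration of how such a kernel gets compiled (a sketch only; in the
// ELPA build the flags, the include paths and the DOUBLE_PRECISION/SINGLE_PRECISION
// selection come from configure and from the precision-specific wrapper files, not
// from compiling this template directly):
//
//   gcc -O3 -mavx        -c complex_avx-avx2_2hv_double_precision.c
//   gcc -O3 -mavx2 -mfma -c complex_avx-avx2_2hv_double_precision.c
//
// The second variant defines __AVX2__ and therefore, together with HAVE_AVX2 from
// the ELPA configuration, enables the __ELPA_USE_FMA__ code paths below.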
#include "config-f90.h"
#include <complex.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline))
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 4
#define __AVX_DATATYPE __m256d
#define _AVX_LOAD _mm256_load_pd
#define _AVX_STORE _mm256_store_pd
#define _AVX_ADD _mm256_add_pd
#define _AVX_MUL _mm256_mul_pd
#define _AVX_ADDSUB _mm256_addsub_pd
#define _AVX_XOR _mm256_xor_pd
#define _AVX_BROADCAST _mm256_broadcast_sd
#define _AVX_SET1 _mm256_set1_pd
#define _AVX_SHUFFLE _mm256_shuffle_pd
#define _SHUFFLE 0x5
#define _CAST _mm256_castpd256_pd128
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c)
#endif
#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif
#define _AVX_FMADDSUB _mm256_FMADDSUB_pd
#define _AVX_FMSUBADD _mm256_FMSUBADD_pd
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 8
#define __AVX_DATATYPE __m256
#define _AVX_LOAD _mm256_load_ps
#define _AVX_STORE _mm256_store_ps
#define _AVX_ADD _mm256_add_ps
#define _AVX_MUL _mm256_mul_ps
#define _AVX_ADDSUB _mm256_addsub_ps
#define _AVX_XOR _mm256_xor_ps
#define _AVX_BROADCAST _mm256_broadcast_ss
#define _AVX_SET1 _mm256_set1_ps
#define _AVX_SHUFFLE _mm256_shuffle_ps
#define _SHUFFLE 0xb1
#define _CAST _mm256_castps256_ps128
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c)
#endif
#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
#endif
#define _AVX_FMADDSUB _mm256_FMADDSUB_ps
#define _AVX_FMSUBADD _mm256_FMSUBADD_ps
#endif
#endif /* SINGLE_PRECISION_COMPLEX */
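
// Note on the arithmetic pattern used throughout the kernels below (a sketch, written
// out for one interleaved (re, im) pair of a vector q and a broadcast scalar
// h = h_re + i*h_im):
//
//   tmp = _AVX_MUL(h_imag, q);                                   // (h_im*q_re, h_im*q_im)
//   _AVX_FMSUBADD(h_real, q, _AVX_SHUFFLE(tmp, tmp, _SHUFFLE));  // (h_re*q_re + h_im*q_im,
//                                                                //  h_re*q_im - h_im*q_re)
//
// i.e. the complex product conj(h)*q on interleaved data. The non-FMA branches obtain
// the same value with _AVX_ADDSUB after flipping the sign of h_imag (the "conjugate"
// XOR with the sign mask), and _AVX_FMADDSUB is the analogous pattern for h*q without
// conjugation.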
#ifdef DOUBLE_PRECISION_COMPLEX
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_16_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
static __forceinline void hh_trafo_complex_kernel_12_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine double_hh_trafo_complex_avx_avx2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_avx_avx2_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_COMPLEX
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine double_hh_trafo_complex_avx_avx2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_avx_avx2_2hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
void double_hh_trafo_complex_avx_avx2_2hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
void double_hh_trafo_complex_avx_avx2_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
  int i;
  int nb = *pnb;
  int nq = *pldq;
  int ldq = *pldq;
  int ldh = *pldh;
  int worked_on;

  worked_on = 0;

  // calculate dot-product of the two Householder vectors
#ifdef DOUBLE_PRECISION_COMPLEX
  double complex s = conj(hh[(ldh)+1])*1.0;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  float complex s = conj(hh[(ldh)+1])*1.0f;
#endif

  for (i = 2; i < nb; i++)
  {
    s += hh[i-1] * conj(hh[(i+ldh)]);
  }

#ifdef DOUBLE_PRECISION_COMPLEX
  for (i = 0; i < nq-6; i+=8)
  {
    hh_trafo_complex_kernel_8_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 8;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  for (i = 0; i < nq-12; i+=16)
  {
    hh_trafo_complex_kernel_16_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
    worked_on += 16;
  }
#endif
  if (nq-i == 0)
  {
    return;
  }

  // handle the remaining columns with the narrower kernels
#ifdef DOUBLE_PRECISION_COMPLEX
  if (nq-i == 6)
  {
    hh_trafo_complex_kernel_6_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 6;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  if (nq-i == 12)
  {
    hh_trafo_complex_kernel_12_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
    worked_on += 12;
  }
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
  if (nq-i == 4)
  {
    hh_trafo_complex_kernel_4_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 4;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  if (nq-i == 8)
  {
    hh_trafo_complex_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
    worked_on += 8;
  }
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
  if (nq-i == 2)
  {
    hh_trafo_complex_kernel_2_AVX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
    worked_on += 2;
  }
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  if (nq-i == 4)
  {
    hh_trafo_complex_kernel_4_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
    worked_on += 4;
  }
#endif
#ifdef WITH_DEBUG
  if (worked_on != nq)
  {
    printf("Error in complex avx-avx2 BLOCK 2 kernel \n");
    abort();
  }
#endif
}
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_16_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
  double* q_dbl = (double*)q;
  double* hh_dbl = (double*)hh;
  double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  float* q_dbl = (float*)q;
  float* hh_dbl = (float*)hh;
  float* s_dbl = (float*)(&s);
#endif

  __AVX_DATATYPE x1, x2, x3, x4;
  __AVX_DATATYPE y1, y2, y3, y4;
  __AVX_DATATYPE q1, q2, q3, q4;
  __AVX_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
  __AVX_DATATYPE tmp1, tmp2, tmp3, tmp4;
  int i = 0;

#ifdef DOUBLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
  __AVX_DATATYPE sign = (__AVX_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

  x1 = _AVX_LOAD(&q_dbl[(2*ldq)+0]);
  x2 = _AVX_LOAD(&q_dbl[(2*ldq)+offset]);
  x3 = _AVX_LOAD(&q_dbl[(2*ldq)+2*offset]);
  x4 = _AVX_LOAD(&q_dbl[(2*ldq)+3*offset]);

  h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+1)*2]);
  h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __ELPA_USE_FMA__
  // conjugate
  h2_imag = _AVX_XOR(h2_imag, sign);
#endif

  y1 = _AVX_LOAD(&q_dbl[0]);
  y2 = _AVX_LOAD(&q_dbl[offset]);
  y3 = _AVX_LOAD(&q_dbl[2*offset]);
  y4 = _AVX_LOAD(&q_dbl[3*offset]);

  tmp1 = _AVX_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
  y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
  y1 = _AVX_ADD(y1, _AVX_ADDSUB(_AVX_MUL(h2_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
  tmp2 = _AVX_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
  y2 = _AVX_ADD(y2, _AVX_FMSUBADD(h2_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
  y2 = _AVX_ADD(y2, _AVX_ADDSUB(_AVX_MUL(h2_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
  tmp3 = _AVX_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
  y3 = _AVX_ADD(y3, _AVX_FMSUBADD(h2_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
  y3 = _AVX_ADD(y3, _AVX_ADDSUB(_AVX_MUL(h2_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
  tmp4 = _AVX_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
  y4 = _AVX_ADD(y4, _AVX_FMSUBADD(h2_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
  y4 = _AVX_ADD(y4, _AVX_ADDSUB(_AVX_MUL(h2_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif

  for (i = 2; i < nb; i++)
  {
    q1 = _AVX_LOAD(&q_dbl[(2*i*ldq)+0]);
    q2 = _AVX_LOAD(&q_dbl[(2*i*ldq)+offset]);
    q3 = _AVX_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
    q4 = _AVX_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

    h1_real = _AVX_BROADCAST(&hh_dbl[(i-1)*2]);
    h1_imag = _AVX_BROADCAST(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
    // conjugate
    h1_imag = _AVX_XOR(h1_imag, sign);
#endif

    tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
    x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    x1 = _AVX_ADD(x1, _AVX_ADDSUB(_AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
    x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    x2 = _AVX_ADD(x2, _AVX_ADDSUB(_AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
    tmp3 = _AVX_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
    x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    x3 = _AVX_ADD(x3, _AVX_ADDSUB(_AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
    tmp4 = _AVX_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
    x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    x4 = _AVX_ADD(x4, _AVX_ADDSUB(_AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif

    h2_real = _AVX_BROADCAST(&hh_dbl[(ldh+i)*2]);
    h2_imag = _AVX_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __ELPA_USE_FMA__
    // conjugate
    h2_imag = _AVX_XOR(h2_imag, sign);
#endif

    tmp1 = _AVX_MUL(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
    y1 = _AVX_ADD(y1, _AVX_FMSUBADD(h2_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
    y1 = _AVX_ADD(y1, _AVX_ADDSUB(_AVX_MUL(h2_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
    tmp2 = _AVX_MUL(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
    y2 = _AVX_ADD(y2, _AVX_FMSUBADD(h2_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
    y2 = _AVX_ADD(y2, _AVX_ADDSUB(_AVX_MUL(h2_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
    tmp3 = _AVX_MUL(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
    y3 = _AVX_ADD(y3, _AVX_FMSUBADD(h2_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
    y3 = _AVX_ADD(y3, _AVX_ADDSUB(_AVX_MUL(h2_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
    tmp4 = _AVX_MUL(h2_imag, q4);
#ifdef __ELPA_USE_FMA__
    y4 = _AVX_ADD(y4, _AVX_FMSUBADD(h2_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
    y4 = _AVX_ADD(y4, _AVX_ADDSUB(_AVX_MUL(h2_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
  }

  h1_real = _AVX_BROADCAST(&hh_dbl[(nb-1)*2]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
  // conjugate
  h1_imag = _AVX_XOR(h1_imag, sign);
#endif

  q1 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+0]);
  q2 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+offset]);
  q3 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
  q4 = _AVX_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);

  tmp1 = _AVX_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
  x1 = _AVX_ADD(x1, _AVX_FMSUBADD(h1_real, q1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
  x1 = _AVX_ADD(x1, _AVX_ADDSUB(_AVX_MUL(h1_real, q1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
  tmp2 = _AVX_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
  x2 = _AVX_ADD(x2, _AVX_FMSUBADD(h1_real, q2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
  x2 = _AVX_ADD(x2, _AVX_ADDSUB(_AVX_MUL(h1_real, q2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
  tmp3 = _AVX_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
  x3 = _AVX_ADD(x3, _AVX_FMSUBADD(h1_real, q3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
  x3 = _AVX_ADD(x3, _AVX_ADDSUB(_AVX_MUL(h1_real, q3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
  tmp4 = _AVX_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
  x4 = _AVX_ADD(x4, _AVX_FMSUBADD(h1_real, q4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
  x4 = _AVX_ADD(x4, _AVX_ADDSUB(_AVX_MUL(h1_real, q4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif

  h1_real = _AVX_BROADCAST(&hh_dbl[0]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[1]);
  h1_real = _AVX_XOR(h1_real, sign);
  h1_imag = _AVX_XOR(h1_imag, sign);

  tmp1 = _AVX_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
  x1 = _AVX_FMADDSUB(h1_real, x1, _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
  x1 = _AVX_ADDSUB(_AVX_MUL(h1_real, x1), _AVX_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
  tmp2 = _AVX_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
  x2 = _AVX_FMADDSUB(h1_real, x2, _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
  x2 = _AVX_ADDSUB(_AVX_MUL(h1_real, x2), _AVX_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
  tmp3 = _AVX_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
  x3 = _AVX_FMADDSUB(h1_real, x3, _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
  x3 = _AVX_ADDSUB(_AVX_MUL(h1_real, x3), _AVX_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
  tmp4 = _AVX_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
  x4 = _AVX_FMADDSUB(h1_real, x4, _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
  x4 = _AVX_ADDSUB(_AVX_MUL(h1_real, x4), _AVX_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif

  h1_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
  h1_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);
  h2_real = _AVX_BROADCAST(&hh_dbl[ldh*2]);
  h2_imag = _AVX_BROADCAST(&hh_dbl[(ldh*2)+1]);

  h1_real = _AVX_XOR(h1_real, sign);
  h1_imag = _AVX_XOR(h1_imag, sign);
  h2_real = _AVX_XOR(h2_real, sign);
  h2_imag = _AVX_XOR(h2_imag, sign);
#ifdef DOUBLE_PRECISION_COMPLEX