Commit 9493dde8, authored Jun 04, 2019 by Andreas Marek

Start to unify complex SSE block1 block2 kernels

parent 777d66f8

Changes: 6 files
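The unification follows the template-instantiation pattern visible in the wrapper files below: each thin per-precision source sets a few preprocessor switches and then includes the shared BLOCK template, so a single template body yields the block1 (1hv) and block2 (2hv) kernels in both precisions. Abridged from this diff:

   /* complex_sse_2hv_double_precision.c after this commit (abridged) */
   #define COMPLEXCASE 1
   #define DOUBLE_PRECISION 1
   #define VEC_SET SSE_128
   #define BLOCK2 1   /* select the two-Householder-vector (2hv) body */
   #include "../../general/precision_macros.h"
   #include "complex_128bit_256bit_512bit_BLOCK_template.c"
   #undef VEC_SET
   #undef BLOCK2
   #undef DOUBLE_PRECISION
   #undef COMPLEXCASE

A block1 wrapper would define BLOCK1 instead of BLOCK2; the #ifdef BLOCK1 / #ifdef BLOCK2 branches inside the template then select the single-vector or fused two-vector code paths.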
Doxyfile.in
@@ -929,7 +929,6 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
           @top_srcdir@/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
           @top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c \
           @top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_double_precision.c \
-          @top_srcdir@/src/elpa2/kernels/complex_sse_2hv_template.c \
           @top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c \
           @top_srcdir@/src/elpa2/kernels/real_bgq.f90 \
           @top_srcdir@/src/elpa2/kernels/real_sse_2hv_single_precision.c \
Makefile.am

@@ -796,7 +796,6 @@ EXTRA_DIST = \
   src/elpa2/kernels/complex_avx512_1hv_template.c \
   src/elpa2/kernels/complex_avx512_2hv_template.c \
   src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
-  src/elpa2/kernels/complex_sse_2hv_template.c \
   src/elpa2/kernels/complex_template.F90 \
   src/elpa2/kernels/real_vsx_4hv_template.c \
   src/elpa2/kernels/real_vsx_6hv_template.c \
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c
@@ -74,6 +74,9 @@
#ifdef BLOCK2
#include <pmmintrin.h>
#endif
#define __forceinline __attribute__((always_inline))
#endif
@@ -102,6 +105,7 @@
#define _SIMD_LOAD _mm_load_pd
#define _SIMD_LOADU _mm_loadu_pd
#define _SIMD_STORE _mm_store_pd
#define _SIMD_STOREU _mm_storeu_pd
#define _SIMD_MUL _mm_mul_pd
#define _SIMD_ADD _mm_add_pd
#define _SIMD_XOR _mm_xor_pd
@@ -116,6 +120,7 @@
#define _SIMD_LOAD _mm_load_ps
#define _SIMD_LOADU _mm_loadu_ps
#define _SIMD_STORE _mm_store_ps
#define _SIMD_STOREU _mm_storeu_ps
#define _SIMD_MUL _mm_mul_ps
#define _SIMD_ADD _mm_add_ps
#define _SIMD_XOR _mm_xor_ps
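These _SIMD_* aliases are what allow one template body to compile down to either the packed-double (_pd) or packed-single (_ps) SSE intrinsics. A minimal sketch of the mechanism (the vector-type macro name is assumed for illustration, it is not shown in these hunks):

   #ifdef DOUBLE_PRECISION_COMPLEX
   #define _SIMD_DATATYPE __m128d   /* assumed name, illustration only */
   #define _SIMD_LOAD  _mm_load_pd
   #define _SIMD_ADD   _mm_add_pd
   #else /* SINGLE_PRECISION_COMPLEX */
   #define _SIMD_DATATYPE __m128
   #define _SIMD_LOAD  _mm_load_ps
   #define _SIMD_ADD   _mm_add_ps
   #endif

   /* One template statement, two instantiations:
         q1 = _SIMD_ADD(_SIMD_LOAD(&q_dbl[0]), x1);
      expands to _mm_add_pd/_mm_load_pd in the double build
      and to _mm_add_ps/_mm_load_ps in the single build.   */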
@@ -234,6 +239,37 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
!f>#endif
*/
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine double_hh_trafo_complex_SSE_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_SSE_2hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_double_complex)     :: q(*)
!f>     type(c_ptr), value                   :: q
!f>     complex(kind=c_double_complex)       :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine double_hh_trafo_complex_SSE_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_SSE_2hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_float_complex)   :: q(*)
!f>     type(c_ptr), value               :: q
!f>     complex(kind=c_float_complex)    :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq
#ifdef BLOCK1
                  )
@@ -259,6 +295,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
   worked_on = 0;
#ifdef BLOCK1
#ifdef DOUBLE_PRECISION_COMPLEX
@@ -860,13 +897,24 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
   q5 = _SIMD_LOAD(&q_dbl[4*offset]);
   q6 = _SIMD_LOAD(&q_dbl[5*offset]);
#ifdef BLOCK1
   q1 = _SIMD_ADD(q1, x1);
   q2 = _SIMD_ADD(q2, x2);
   q3 = _SIMD_ADD(q3, x3);
   q4 = _SIMD_ADD(q4, x4);
   q5 = _SIMD_ADD(q5, x5);
   q6 = _SIMD_ADD(q6, x6);
#endif
#ifdef BLOCK2
   q1 = _SIMD_ADD(q1, y1);
   q2 = _SIMD_ADD(q2, y2);
   q3 = _SIMD_ADD(q3, y3);
   q4 = _SIMD_ADD(q4, y4);
   q5 = _SIMD_ADD(q5, y5);
   q6 = _SIMD_ADD(q6, y6);
#endif
   _SIMD_STORE(&q_dbl[0], q1);
   _SIMD_STORE(&q_dbl[offset], q2);
   _SIMD_STORE(&q_dbl[2*offset], q3);
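In scalar terms this store-back hunk computes the following (a sketch, not part of the diff; the real code works on interleaved re/im lanes): under BLOCK1 the accumulated single-vector update x is added to q, under BLOCK2 the second accumulator y of the fused two-vector transformation is added instead.

   /* Scalar sketch of the guarded store-back (illustration only). */
   for (int k = 0; k < row_length; k++) {
   #ifdef BLOCK1
      q[k] += x[k];   /* 1hv update */
   #endif
   #ifdef BLOCK2
      q[k] += y[k];   /* 2hv update at this row */
   #endif
   }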
@@ -1209,6 +1257,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
   y1 = _SIMD_ADD(y1, _SIMD_ADDSUB(_SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
   tmp2 = _SIMD_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
   y2 = _SIMD_ADD(y2, _mm_msubadd_pd(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
@@ -1222,6 +1271,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
   y3 = _SIMD_ADD(y3, _SIMD_ADDSUB(_SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
   tmp4 = _SIMD_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
   y4 = _SIMD_ADD(y4, _mm_msubadd_pd(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
@@ -1362,6 +1412,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
   x1 = _SIMD_ADD(x1, _SIMD_ADDSUB(_SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
   tmp2 = _SIMD_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
   x2 = _SIMD_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
@@ -1375,6 +1426,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
   x3 = _SIMD_ADD(x3, _SIMD_ADDSUB(_SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
   tmp4 = _SIMD_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
   x4 = _SIMD_ADD(x4, _mm_msubadd_pd(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
@@ -1405,6 +1457,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
   x1 = _SIMD_ADDSUB(_SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
   tmp2 = _SIMD_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
   x2 = _SIMD_MADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
@@ -1418,6 +1471,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
   x3 = _SIMD_ADDSUB(_SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
   tmp4 = _SIMD_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
   x4 = _SIMD_MADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
@@ -1478,6 +1532,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
   y1 = _SIMD_ADDSUB(_SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
   tmp2 = _SIMD_MUL(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
   y2 = _mm_maddsub_pd(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
@@ -1491,6 +1546,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
   y3 = _SIMD_ADDSUB(_SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif
   tmp4 = _SIMD_MUL(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
   y4 = _mm_maddsub_pd(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
@@ -1504,6 +1560,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
   y1 = _SIMD_ADD(y1, _SIMD_ADDSUB(_SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
   tmp2 = _SIMD_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
   y2 = _SIMD_ADD(y2, _mm_maddsub_pd(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
@@ -1517,6 +1574,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
   y3 = _SIMD_ADD(y3, _SIMD_ADDSUB(_SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
   tmp4 = _SIMD_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
   y4 = _SIMD_ADD(y4, _mm_maddsub_pd(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
@@ -1531,10 +1589,19 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
   q3 = _SIMD_LOAD(&q_dbl[2*offset]);
   q4 = _SIMD_LOAD(&q_dbl[3*offset]);
#ifdef BLOCK1
   q1 = _SIMD_ADD(q1, x1);
   q2 = _SIMD_ADD(q2, x2);
   q3 = _SIMD_ADD(q3, x3);
   q4 = _SIMD_ADD(q4, x4);
#endif
#ifdef BLOCK2
   q1 = _SIMD_ADD(q1, y1);
   q2 = _SIMD_ADD(q2, y2);
   q3 = _SIMD_ADD(q3, y3);
   q4 = _SIMD_ADD(q4, y4);
#endif
   _SIMD_STORE(&q_dbl[0], q1);
   _SIMD_STORE(&q_dbl[offset], q2);
@@ -1569,6 +1636,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
   q1 = _SIMD_ADD(q1, _SIMD_ADDSUB(_SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
   tmp2 = _SIMD_MUL(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
   q2 = _SIMD_ADD(q2, _mm_maddsub_pd(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
@@ -1582,6 +1650,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
   q3 = _SIMD_ADD(q3, _SIMD_ADDSUB(_SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
   tmp4 = _SIMD_MUL(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
   q4 = _SIMD_ADD(q4, _mm_maddsub_pd(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
@@ -1614,6 +1683,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
   q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
   q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
   q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);

   tmp1 = _SIMD_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
@@ -2030,8 +2100,16 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
   q1 = _SIMD_LOAD(&q_dbl[0]);
   q2 = _SIMD_LOAD(&q_dbl[offset]);
#ifdef BLOCK1
   q1 = _SIMD_ADD(q1, x1);
   q2 = _SIMD_ADD(q2, x2);
#endif
#ifdef BLOCK2
   q1 = _SIMD_ADD(q1, y1);
   q2 = _SIMD_ADD(q2, y2);
#endif
   _SIMD_STORE(&q_dbl[0], q1);
   _SIMD_STORE(&q_dbl[offset], q2);
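The recurring _SIMD_MUL/_SIMD_SHUFFLE/_SIMD_ADDSUB triple in these hunks is the SSE3 idiom for a complex multiply-accumulate on interleaved (re, im) data: with the Householder coefficient splat into (h_re, h_re) and (h_im, h_im), and h_imag sign-flipped via the XOR on the non-FMA path, the sequence accumulates conj(h)*x. A self-contained sketch, not part of the commit (compile with -msse3):

   #include <stdio.h>
   #include <pmmintrin.h>   /* SSE3: _mm_addsub_pd */

   int main(void)
   {
      double h[2] = { 2.0, 0.5 };   /* h = 2 + 0.5i */
      double x[2] = { 1.0, 3.0 };   /* x = 1 + 3i   */

      __m128d h_real = _mm_loaddup_pd(&h[0]);   /* (h_re, h_re) */
      __m128d h_imag = _mm_loaddup_pd(&h[1]);   /* (h_im, h_im) */
      __m128d sign   = _mm_castsi128_pd(_mm_set1_epi64x((long long)0x8000000000000000ULL));
      h_imag = _mm_xor_pd(h_imag, sign);        /* conjugate: (-h_im, -h_im) */

      __m128d xv  = _mm_loadu_pd(x);
      __m128d tmp = _mm_mul_pd(h_imag, xv);                 /* (-h_im*x_re, -h_im*x_im) */
      tmp = _mm_shuffle_pd(tmp, tmp, _MM_SHUFFLE2(0, 1));   /* swap the two lanes */
      /* addsub(a,b) = (a0-b0, a1+b1)
         = (h_re*x_re + h_im*x_im, h_re*x_im - h_im*x_re) = conj(h)*x */
      __m128d res = _mm_addsub_pd(_mm_mul_pd(h_real, xv), tmp);

      double out[2];
      _mm_storeu_pd(out, res);
      printf("conj(h)*x = %g + %gi\n", out[0], out[1]);     /* prints 3.5 + 5.5i */
      return 0;
   }

On the FMA path the kernels call _mm_msubadd_pd/_mm_maddsub_pd instead, which handle the alternating signs inside the fused multiply, so the explicit XOR is skipped.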
src/elpa2/kernels/complex_sse_2hv_double_precision.c

@@ -48,8 +48,12 @@
 #define COMPLEXCASE 1
 #define DOUBLE_PRECISION 1
+#define VEC_SET SSE_128
+#define BLOCK2 1
 #include "../../general/precision_macros.h"
-#include "complex_sse_2hv_template.c"
+#include "complex_128bit_256bit_512bit_BLOCK_template.c"
+#undef VEC_SET
+#undef BLOCK2
 #undef DOUBLE_PRECISION
 #undef COMPLEXCASE
src/elpa2/kernels/complex_sse_2hv_single_precision.c

@@ -48,8 +48,12 @@
 #define COMPLEXCASE 1
 #define SINGLE_PRECISION 1
+#define VEC_SET SSE_128
+#define BLOCK2 1
 #include "../../general/precision_macros.h"
-#include "complex_sse_2hv_template.c"
+#include "complex_128bit_256bit_512bit_BLOCK_template.c"
+#undef VEC_SET
+#undef BLOCK2
 #undef SINGLE_PRECISION
 #undef COMPLEXCASE
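For reference, a hypothetical C caller of the entry points these two wrappers now generate (the signature is taken from the !f> interface blocks added to the template; the helper function name is illustrative only):

   #include <complex.h>

   /* Generated by the BLOCK2 / double-precision instantiation above. */
   extern void double_hh_trafo_complex_SSE_2hv_double(double complex *q,
                                                      double complex *hh,
                                                      int *pnb, int *pnq,
                                                      int *pldq, int *pldh);

   /* Illustrative helper: nb = length of the two Householder vectors,
      nq = extent of q swept by the kernel driver, ldq/ldh = leading
      dimensions of q and hh (hh stores the vectors as hh(1:nb, 1:2)). */
   static void apply_2hv_sse_double(double complex *q, double complex *hh,
                                    int nb, int nq, int ldq, int ldh)
   {
      double_hh_trafo_complex_SSE_2hv_double(q, hh, &nb, &nq, &ldq, &ldh);
   }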
src/elpa2/kernels/complex_sse_2hv_template.c
deleted 100644 → 0
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>
#include <pmmintrin.h>

#define __forceinline __attribute__((always_inline))
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 2
#define __SSE_DATATYPE __m128d
#define _SSE_LOAD _mm_load_pd
#define _SSE_LOADU _mm_loadu_pd
#define _SSE_STORE _mm_store_pd
#define _SSE_STOREU _mm_storeu_pd
#define _SSE_ADD _mm_add_pd
#define _SSE_XOR _mm_xor_pd
#define _SSE_ADDSUB _mm_addsub_pd
#define _SSE_MUL _mm_mul_pd
#define _SSE_SHUFFLE _mm_shuffle_pd
#define _SHUFFLE _MM_SHUFFLE2(0,1)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 4
#define __SSE_DATATYPE __m128
#define _SSE_LOAD _mm_load_ps
#define _SSE_LOADU _mm_loadu_ps
#define _SSE_STORE _mm_store_ps
#define _SSE_STOREU _mm_storeu_ps
#define _SSE_ADD _mm_add_ps
#define _SSE_XOR _mm_xor_ps
#define _SSE_ADDSUB _mm_addsub_ps
#define _SSE_MUL _mm_mul_ps
#define _SSE_SHUFFLE _mm_shuffle_ps
#define _SHUFFLE 0xb1
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
#if 0
static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
#endif
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine double_hh_trafo_complex_sse_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_sse_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_COMPLEX
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine double_hh_trafo_complex_sse_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_sse_2hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
void double_hh_trafo_complex_sse_2hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
void double_hh_trafo_complex_sse_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
   int i;
   int nb = *pnb;
   int nq = *pldq;
   int ldq = *pldq;
   int ldh = *pldh;

#ifdef DOUBLE_PRECISION_COMPLEX
   double complex s = conj(hh[(ldh)+1])*1.0;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
   float complex s = conj(hh[(ldh)+1])*1.0f;
#endif

   for (i = 2; i < nb; i++)
   {
      s += hh[i-1] * conj(hh[(i+ldh)]);
   }

   for (i = 0; i < nq; i+=4)
   {
#ifdef DOUBLE_PRECISION_COMPLEX
      hh_trafo_complex_kernel_4_SSE_2hv_double(&q[i], hh, nb, ldq, ldh, s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
      hh_trafo_complex_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
#endif
   }
}
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
   double* q_dbl = (double*)q;
   double* hh_dbl = (double*)hh;
   double* s_dbl = (double*)(&s);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
   float* q_dbl = (float*)q;
   float* hh_dbl = (float*)hh;
   float* s_dbl = (float*)(&s);
#endif

   __SSE_DATATYPE x1, x2, x3, x4;
   __SSE_DATATYPE y1, y2, y3, y4;
   __SSE_DATATYPE q1, q2, q3, q4;
   __SSE_DATATYPE h1_real, h1_imag, h2_real, h2_imag;
   __SSE_DATATYPE tmp1, tmp2, tmp3, tmp4;
   int i = 0;

#ifdef DOUBLE_PRECISION_COMPLEX
   __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
   __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

   x1 = _SSE_LOAD(&q_dbl[(2*ldq)+0]);
   x2 = _SSE_LOAD(&q_dbl[(2*ldq)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
   x3 = _SSE_LOAD(&q_dbl[(2*ldq)+2*offset]);
   x4 = _SSE_LOAD(&q_dbl[(2*ldq)+3*offset]);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
   h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
   h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
   h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
   h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
#endif
#ifndef __ELPA_USE_FMA__
   // conjugate
   h2_imag = _SSE_XOR(h2_imag, sign);
#endif

   y1 = _SSE_LOAD(&q_dbl[0]);
   y2 = _SSE_LOAD(&q_dbl[offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
   y3 = _SSE_LOAD(&q_dbl[2*offset]);
   y4 = _SSE_LOAD(&q_dbl[3*offset]);
#endif

   tmp1 = _SSE_MUL(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
   y1 = _SSE_ADD(y1, _mm_msubadd_pd(h2_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
   y1 = _SSE_ADD(y1, _SSE_ADDSUB(_SSE_MUL(h2_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
   tmp2 = _SSE_MUL(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
   y2 = _SSE_ADD(y2, _mm_msubadd_pd(h2_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
   y2 = _SSE_ADD(y2, _SSE_ADDSUB(_SSE_MUL(h2_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
   tmp3 = _SSE_MUL(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
   y3 = _SSE_ADD(y3, _mm_msubadd_pd(h2_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
   y3 = _SSE_ADD(y3, _SSE_ADDSUB(_SSE_MUL(h2_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
   tmp4 = _SSE_MUL(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
   y4 = _SSE_ADD(y4, _mm_msubadd_pd(h2_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
   y4 = _SSE_ADD(y4, _SSE_ADDSUB(_SSE_MUL(h2_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

   for (i = 2; i < nb; i++)
   {
      q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
      q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
      q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
      q4 = _SSE_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
      h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
      h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));
#endif
#ifndef __ELPA_USE_FMA__
      // conjugate
      h1_imag = _SSE_XOR(h1_imag, sign);
#endif

      tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
      x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
      x1 = _SSE_ADD(x1, _SSE_ADDSUB(_SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
      tmp2 = _SSE_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
      x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
      x2 = _SSE_ADD(x2, _SSE_ADDSUB(_SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
      tmp3 = _SSE_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
      x3 = _SSE_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
      x3 = _SSE_ADD(x3, _SSE_ADDSUB(_SSE_MUL(h1_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif
      tmp4 = _SSE_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
      x4 = _SSE_ADD(x4, _mm_msubadd_pd(h1_real, q4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
      x4 = _SSE_ADD(x4, _SSE_ADDSUB(_SSE_MUL(h1_real, q4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

#ifdef DOUBLE_PRECISION_COMPLEX
      h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
      h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);