Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sebastian Ohlmann
elpa
Commits
9493dde8
Commit
9493dde8
authored
Jun 04, 2019
by
Andreas Marek
Browse files
Start to unify complex SSE block1 block2 kernels
parent
777d66f8
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Doxyfile.in
View file @
9493dde8
...
...
@@ -929,7 +929,6 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_bgq.f90 \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_single_precision.c \
...
...
Makefile.am
View file @
9493dde8
...
...
@@ -796,7 +796,6 @@ EXTRA_DIST = \
src/elpa2/kernels/complex_avx512_1hv_template.c
\
src/elpa2/kernels/complex_avx512_2hv_template.c
\
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c
\
src/elpa2/kernels/complex_sse_2hv_template.c
\
src/elpa2/kernels/complex_template.F90
\
src/elpa2/kernels/real_vsx_4hv_template.c
\
src/elpa2/kernels/real_vsx_6hv_template.c
\
...
...
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c
View file @
9493dde8
...
...
@@ -74,6 +74,9 @@
#ifdef BLOCK2
#include
<pmmintrin.h>
#endif
#define __forceinline __attribute__((always_inline))
#endif
...
...
@@ -102,6 +105,7 @@
#define _SIMD_LOAD _mm_load_pd
#define _SIMD_LOADU _mm_loadu_pd
#define _SIMD_STORE _mm_store_pd
#define _SIMD_STOREU _mm_storeu_pd
#define _SIMD_MUL _mm_mul_pd
#define _SIMD_ADD _mm_add_pd
#define _SIMD_XOR _mm_xor_pd
...
...
@@ -116,6 +120,7 @@
#define _SIMD_LOAD _mm_load_ps
#define _SIMD_LOADU _mm_loadu_ps
#define _SIMD_STORE _mm_store_ps
#define _SIMD_STOREU _mm_storeu_ps
#define _SIMD_MUL _mm_mul_ps
#define _SIMD_ADD _mm_add_ps
#define _SIMD_XOR _mm_xor_ps
...
...
@@ -234,6 +239,37 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
!f>#endif
*/
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine double_hh_trafo_complex_SSE_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_SSE_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine double_hh_trafo_complex_SSE_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_SSE_2hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void
CONCAT_7ARGS
(
PREFIX
,
_hh_trafo_complex_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
#ifdef BLOCK1
)
...
...
@@ -259,6 +295,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
worked_on
=
0
;
#ifdef BLOCK1
#ifdef DOUBLE_PRECISION_COMPLEX
...
...
@@ -860,13 +897,24 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
q5
=
_SIMD_LOAD
(
&
q_dbl
[
4
*
offset
]);
q6
=
_SIMD_LOAD
(
&
q_dbl
[
5
*
offset
]);
#ifdef BLOCK1
q1
=
_SIMD_ADD
(
q1
,
x1
);
q2
=
_SIMD_ADD
(
q2
,
x2
);
q3
=
_SIMD_ADD
(
q3
,
x3
);
q4
=
_SIMD_ADD
(
q4
,
x4
);
q5
=
_SIMD_ADD
(
q5
,
x5
);
q6
=
_SIMD_ADD
(
q6
,
x6
);
#endif
#ifdef BLOCK2
q1
=
_SIMD_ADD
(
q1
,
y1
);
q2
=
_SIMD_ADD
(
q2
,
y2
);
q3
=
_SIMD_ADD
(
q3
,
y3
);
q4
=
_SIMD_ADD
(
q4
,
y4
);
q5
=
_SIMD_ADD
(
q5
,
y5
);
q6
=
_SIMD_ADD
(
q6
,
y6
);
#endif
_SIMD_STORE
(
&
q_dbl
[
0
],
q1
);
_SIMD_STORE
(
&
q_dbl
[
offset
],
q2
);
_SIMD_STORE
(
&
q_dbl
[
2
*
offset
],
q3
);
...
...
@@ -1209,6 +1257,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
y1
=
_SIMD_ADD
(
y1
,
_SIMD_ADDSUB
(
_SIMD_MUL
(
h2_real
,
x1
),
_SIMD_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
)));
#endif
tmp2
=
_SIMD_MUL
(
h2_imag
,
x2
);
#ifdef __ELPA_USE_FMA__
y2
=
_SIMD_ADD
(
y2
,
_mm_msubadd_pd
(
h2_real
,
x2
,
_SIMD_SHUFFLE
(
tmp2
,
tmp2
,
_SHUFFLE
)));
...
...
@@ -1222,6 +1271,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
y3
=
_SIMD_ADD
(
y3
,
_SIMD_ADDSUB
(
_SIMD_MUL
(
h2_real
,
x3
),
_SIMD_SHUFFLE
(
tmp3
,
tmp3
,
_SHUFFLE
)));
#endif
tmp4
=
_SIMD_MUL
(
h2_imag
,
x4
);
#ifdef __ELPA_USE_FMA__
y4
=
_SIMD_ADD
(
y4
,
_mm_msubadd_pd
(
h2_real
,
x4
,
_SIMD_SHUFFLE
(
tmp4
,
tmp4
,
_SHUFFLE
)));
...
...
@@ -1362,6 +1412,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
x1
=
_SIMD_ADD
(
x1
,
_SIMD_ADDSUB
(
_SIMD_MUL
(
h1_real
,
q1
),
_SIMD_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
)));
#endif
tmp2
=
_SIMD_MUL
(
h1_imag
,
q2
);
#ifdef __ELPA_USE_FMA__
x2
=
_SIMD_ADD
(
x2
,
_mm_msubadd_pd
(
h1_real
,
q2
,
_SIMD_SHUFFLE
(
tmp2
,
tmp2
,
_SHUFFLE
)));
...
...
@@ -1375,6 +1426,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
x3
=
_SIMD_ADD
(
x3
,
_SIMD_ADDSUB
(
_SIMD_MUL
(
h1_real
,
q3
),
_SIMD_SHUFFLE
(
tmp3
,
tmp3
,
_SHUFFLE
)));
#endif
tmp4
=
_SIMD_MUL
(
h1_imag
,
q4
);
#ifdef __ELPA_USE_FMA__
x4
=
_SIMD_ADD
(
x4
,
_mm_msubadd_pd
(
h1_real
,
q4
,
_SIMD_SHUFFLE
(
tmp4
,
tmp4
,
_SHUFFLE
)));
...
...
@@ -1405,6 +1457,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
x1
=
_SIMD_ADDSUB
(
_SIMD_MUL
(
h1_real
,
x1
),
_SIMD_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
));
#endif
tmp2
=
_SIMD_MUL
(
h1_imag
,
x2
);
#ifdef __ELPA_USE_FMA__
x2
=
_SIMD_MADDSUB
(
h1_real
,
x2
,
_SIMD_SHUFFLE
(
tmp2
,
tmp2
,
_SHUFFLE
));
...
...
@@ -1418,6 +1471,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
x3
=
_SIMD_ADDSUB
(
_SIMD_MUL
(
h1_real
,
x3
),
_SIMD_SHUFFLE
(
tmp3
,
tmp3
,
_SHUFFLE
));
#endif
tmp4
=
_SIMD_MUL
(
h1_imag
,
x4
);
#ifdef __ELPA_USE_FMA__
x4
=
_SIMD_MADDSUB
(
h1_real
,
x4
,
_SIMD_SHUFFLE
(
tmp4
,
tmp4
,
_SHUFFLE
));
...
...
@@ -1478,6 +1532,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
y1
=
_SIMD_ADDSUB
(
_SIMD_MUL
(
h1_real
,
y1
),
_SIMD_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
));
#endif
tmp2
=
_SIMD_MUL
(
h1_imag
,
y2
);
#ifdef __ELPA_USE_FMA__
y2
=
_mm_maddsub_pd
(
h1_real
,
y2
,
_SIMD_SHUFFLE
(
tmp2
,
tmp2
,
_SHUFFLE
));
...
...
@@ -1491,6 +1546,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
y3
=
_SIMD_ADDSUB
(
_SIMD_MUL
(
h1_real
,
y3
),
_SIMD_SHUFFLE
(
tmp3
,
tmp3
,
_SHUFFLE
));
#endif
tmp4
=
_SIMD_MUL
(
h1_imag
,
y4
);
#ifdef __ELPA_USE_FMA__
y4
=
_mm_maddsub_pd
(
h1_real
,
y4
,
_SIMD_SHUFFLE
(
tmp4
,
tmp4
,
_SHUFFLE
));
...
...
@@ -1504,6 +1560,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
y1
=
_SIMD_ADD
(
y1
,
_SIMD_ADDSUB
(
_SIMD_MUL
(
h2_real
,
x1
),
_SIMD_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
)));
#endif
tmp2
=
_SIMD_MUL
(
h2_imag
,
x2
);
#ifdef __ELPA_USE_FMA__
y2
=
_SIMD_ADD
(
y2
,
_mm_maddsub_pd
(
h2_real
,
x2
,
_SIMD_SHUFFLE
(
tmp2
,
tmp2
,
_SHUFFLE
)));
...
...
@@ -1517,6 +1574,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
y3
=
_SIMD_ADD
(
y3
,
_SIMD_ADDSUB
(
_SIMD_MUL
(
h2_real
,
x3
),
_SIMD_SHUFFLE
(
tmp3
,
tmp3
,
_SHUFFLE
)));
#endif
tmp4
=
_SIMD_MUL
(
h2_imag
,
x4
);
#ifdef __ELPA_USE_FMA__
y4
=
_SIMD_ADD
(
y4
,
_mm_maddsub_pd
(
h2_real
,
x4
,
_SIMD_SHUFFLE
(
tmp4
,
tmp4
,
_SHUFFLE
)));
...
...
@@ -1531,10 +1589,19 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
q3
=
_SIMD_LOAD
(
&
q_dbl
[
2
*
offset
]);
q4
=
_SIMD_LOAD
(
&
q_dbl
[
3
*
offset
]);
#ifdef BLOCK1
q1
=
_SIMD_ADD
(
q1
,
x1
);
q2
=
_SIMD_ADD
(
q2
,
x2
);
q3
=
_SIMD_ADD
(
q3
,
x3
);
q4
=
_SIMD_ADD
(
q4
,
x4
);
#endif
#ifdef BLOCK2
q1
=
_SIMD_ADD
(
q1
,
y1
);
q2
=
_SIMD_ADD
(
q2
,
y2
);
q3
=
_SIMD_ADD
(
q3
,
y3
);
q4
=
_SIMD_ADD
(
q4
,
y4
);
#endif
_SIMD_STORE
(
&
q_dbl
[
0
],
q1
);
_SIMD_STORE
(
&
q_dbl
[
offset
],
q2
);
...
...
@@ -1569,6 +1636,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
q1
=
_SIMD_ADD
(
q1
,
_SIMD_ADDSUB
(
_SIMD_MUL
(
h2_real
,
y1
),
_SIMD_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
)));
#endif
tmp2
=
_SIMD_MUL
(
h2_imag
,
y2
);
#ifdef __ELPA_USE_FMA__
q2
=
_SIMD_ADD
(
q2
,
_mm_maddsub_pd
(
h2_real
,
y2
,
_SIMD_SHUFFLE
(
tmp2
,
tmp2
,
_SHUFFLE
)));
...
...
@@ -1582,6 +1650,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#else
q3
=
_SIMD_ADD
(
q3
,
_SIMD_ADDSUB
(
_SIMD_MUL
(
h2_real
,
y3
),
_SIMD_SHUFFLE
(
tmp3
,
tmp3
,
_SHUFFLE
)));
#endif
tmp4
=
_SIMD_MUL
(
h2_imag
,
y4
);
#ifdef __ELPA_USE_FMA__
q4
=
_SIMD_ADD
(
q4
,
_mm_maddsub_pd
(
h2_real
,
y4
,
_SIMD_SHUFFLE
(
tmp4
,
tmp4
,
_SHUFFLE
)));
...
...
@@ -1614,6 +1683,7 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
q2
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
offset
]);
q3
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
2
*
offset
]);
q4
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
3
*
offset
]);
tmp1
=
_SIMD_MUL
(
h1_imag
,
x1
);
#ifdef __ELPA_USE_FMA__
...
...
@@ -2030,8 +2100,16 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
q1
=
_SIMD_LOAD
(
&
q_dbl
[
0
]);
q2
=
_SIMD_LOAD
(
&
q_dbl
[
offset
]);
#ifdef BLOCK1
q1
=
_SIMD_ADD
(
q1
,
x1
);
q2
=
_SIMD_ADD
(
q2
,
x2
);
#endif
#ifdef BLOCK2
q1
=
_SIMD_ADD
(
q1
,
y1
);
q2
=
_SIMD_ADD
(
q2
,
y2
);
#endif
_SIMD_STORE
(
&
q_dbl
[
0
],
q1
);
_SIMD_STORE
(
&
q_dbl
[
offset
],
q2
);
...
...
src/elpa2/kernels/complex_sse_2hv_double_precision.c
View file @
9493dde8
...
...
@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#define VEC_SET SSE_128
#define BLOCK2 1
#include
"../../general/precision_macros.h"
#include
"complex_sse_2hv_template.c"
#include
"complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
src/elpa2/kernels/complex_sse_2hv_single_precision.c
View file @
9493dde8
...
...
@@ -48,8 +48,12 @@
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#define VEC_SET SSE_128
#define BLOCK2 1
#include
"../../general/precision_macros.h"
#include
"complex_sse_2hv_template.c"
#include
"complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef SINGLE_PRECISION
#undef COMPLEXCASE
src/elpa2/kernels/complex_sse_2hv_template.c
deleted
100644 → 0
View file @
777d66f8
This diff is collapsed.
Click to expand it.
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment