Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
0d1da4da
Commit
0d1da4da
authored
Jun 12, 2019
by
Andreas Marek
Browse files
Unify complex avx512 block1 kernel
parent
7de3d8d3
Changes
6
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Doxyfile.in
View file @
0d1da4da
...
...
@@ -935,7 +935,6 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_simple.F90 \
@top_srcdir@/src/elpa2/kernels/complex.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c \
...
...
Makefile.am
View file @
0d1da4da
...
...
@@ -791,7 +791,6 @@ EXTRA_DIST = \
src/elpa2/elpa2_trans_ev_band_to_full_template.F90
\
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90
\
src/elpa2/elpa2_tridiag_band_template.F90
\
src/elpa2/kernels/complex_avx512_1hv_template.c
\
src/elpa2/kernels/complex_avx512_2hv_template.c
\
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c
\
src/elpa2/kernels/complex_template.F90
\
...
...
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c
View file @
0d1da4da
...
...
@@ -68,9 +68,10 @@
//define instruction set numbers
#define SSE_128 128
#define AVX_256 256
#define AVX_512 512
#define NEON_ARCH64_128 1285
#if VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == 512
#if VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET ==
AVX_
512
#include
<x86intrin.h>
#ifdef BLOCK2
#if VEC_SET == SSE_128
...
...
@@ -106,6 +107,11 @@
#define SIMD_SET AVX_AVX2
#endif
#if VEC_SET == AVX_512
#define SIMD_SET AVX512
#endif
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
...
...
@@ -225,6 +231,72 @@
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 8
#define __SIMD_DATATYPE __m512d
#define _SIMD_LOAD _mm512_load_pd
#define _SIMD_LOADU 1
#define _SIMD_STORE _mm512_store_pd
#define _SIMD_STOREU 1
#define _SIMD_MUL _mm512_mul_pd
#define _SIMD_ADD _mm512_add_pd
#ifdef HAVE_AVX512_XEON
#define _SIMD_XOR _mm512_xor_pd
#endif
#define _SIMD_BROADCAST 1
#define _SIMD_SET1 _mm512_set1_pd
#define _SIMD_XOR_EPI _mm512_xor_epi64
#define _SIMD_ADDSUB 1
#define _SIMD_SHUFFLE _mm512_shuffle_pd
#define _SHUFFLE 0x55
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
#define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c)
#define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c)
#define _SIMD_FMADDSUB _mm512_FMADDSUB_pd
#define _SIMD_FMSUBADD _mm512_FMSUBADD_pd
#endif
/* HAVE_AVX512 */
#endif
/* DOUBLE_PRECISION_COMPLEX */
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 16
#define __SIMD_DATATYPE __m512
#define _SIMD_LOAD _mm512_load_ps
#define _SIMD_LOADU 1
#define _SIMD_STORE _mm512_store_ps
#define _SIMD_STOREU 1
#define _SIMD_MUL _mm512_mul_ps
#define _SIMD_ADD _mm512_add_ps
#ifdef HAVE_AVX512_XEON
#define _SIMD_XOR _mm512_xor_ps
#endif
#define _SIMD_BROADCAST 1
#define _SIMD_SET1 _mm512_set1_ps
#define _SIMD_ADDSUB 1
#define _SIMD_SHUFFLE _mm512_shuffle_ps
#define _SIMD_XOR_EPI _mm512_xor_epi32
#define _SHUFFLE 0xb1
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
#define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c)
#define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c)
#define _SIMD_FMADDSUB _mm512_FMADDSUB_ps
#define _SIMD_FMSUBADD _mm512_FMSUBADD_ps
#endif
/* HAVE_AVX512 */
#endif
/* SINGLE_PRECISION_COMPLEX */
#endif
/* VEC_SET == AVX_512 */
#define __forceinline __attribute__((always_inline))
...
...
@@ -250,6 +322,8 @@
#endif
//Forward declaration
#if VEC_SET == SSE_128
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
...
...
@@ -272,7 +346,16 @@
#endif
#endif
/* VEC_SET == AVX_256 */
//Forward declaration
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 24
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 48
#endif
#endif
/* VEC_SET == AVX_512 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
#ifdef BLOCK1
);
...
...
@@ -292,7 +375,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif
/* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
...
...
@@ -304,6 +386,16 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 20
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 40
#endif
#endif
/* VEC_SET == AVX_512 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
#ifdef BLOCK1
...
...
@@ -325,7 +417,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif
/* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
...
...
@@ -337,6 +428,16 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 16
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 32
#endif
#endif
/* VEC_SET == AVX_512 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
#ifdef BLOCK1
...
...
@@ -357,7 +458,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif
/* VEC_SET == SSE_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
...
...
@@ -369,6 +469,16 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 12
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 24
#endif
#endif
/* VEC_SET == AVX_512 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
#ifdef BLOCK1
...
...
@@ -400,6 +510,16 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 8
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 16
#endif
#endif
/* VEC_SET == AVX_512 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
#ifdef BLOCK1
...
...
@@ -431,6 +551,17 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#undef ROW_LENGTH
#define ROW_LENGTH 8
#endif
#endif
/* VEC_SET == AVX_512 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
#ifdef BLOCK1
);
...
...
@@ -501,6 +632,37 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine single_hh_trafo_complex_AVX512_1hv_double(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_AVX512_1hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_double_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_double_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine single_hh_trafo_complex_AVX512_1hv_single(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_AVX512_1hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> ! complex(kind=c_float_complex) :: q(*)
!f> type(c_ptr), value :: q
!f> complex(kind=c_float_complex) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
...
...
@@ -615,12 +777,52 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 24
#define STEP_SIZE 24
#define UPPER_BOUND 20
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 48
#define STEP_SIZE 48
#define UPPER_BOUND 40
#endif
#endif
/* VEC_SET == AVX_512 */
//#if VEC_SET != AVX_512
for
(
i
=
0
;
i
<
nq
-
UPPER_BOUND
;
i
+=
STEP_SIZE
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
);
worked_on
+=
ROW_LENGTH
;
}
//#else
// for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE)
// {
//
//#ifdef DOUBLE_PRECISION_COMPLEX
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+4], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+8], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+12], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+16], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+20], hh, nb, ldq);
//#endif
//#ifdef SINGLE_PRECISION_COMPLEX
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+8], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+16], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+24], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+32], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+40], hh, nb, ldq);
//#endif
// worked_on += ROW_LENGTH;
// }
//#endif
if
(
nq
==
i
)
{
return
;
}
...
...
@@ -645,11 +847,42 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 20
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 40
#endif
#endif
/* VEC_SET == AVX_512 */
//#if VEC_SET != AVX_512
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
);
worked_on
+=
ROW_LENGTH
;
}
//#else
// if (nq-i == ROW_LENGTH)
// {
//#ifdef DOUBLE_PRECISION_COMPLEX
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+4], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+8], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+12], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+16], hh, nb, ldq);
//#endif
//#ifdef SINGLE_PRECISION_COMPLEX
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+8], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+16], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+24], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+32], hh, nb, ldq);
//#endif
// worked_on += ROW_LENGTH;
// }
//#endif
#if VEC_SET == SSE_128
...
...
@@ -672,11 +905,42 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 16
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 32
#endif
#endif
/* VEC_SET == AVX_512 */
//#if VEC_SET != AVX_512
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
);
worked_on
+=
ROW_LENGTH
;
}
//#else
// if (nq-i == ROW_LENGTH)
// {
//#ifdef DOUBLE_PRECISION_COMPLEX
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+4], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+8], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+12], hh, nb, ldq);
//#endif
//#ifdef SINGLE_PRECISION_COMPLEX
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+8], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+16], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+24], hh, nb, ldq);
//#endif
//
// worked_on += ROW_LENGTH;
// }
//#endif
#if VEC_SET == SSE_128
#undef ROW_LENGTH
...
...
@@ -698,11 +962,38 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 12
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 24
#endif
#endif
/* VEC_SET == AVX_512 */
//#if VEC_SET != AVX_512
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
);
worked_on
+=
ROW_LENGTH
;
}
//#else
// if (nq-i == ROW_LENGTH)
// {
//#ifdef DOUBLE_PRECISION_COMPLEX
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+4], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+8], hh, nb, ldq);
//#endif
//#ifdef SINGLE_PRECISION_COMPLEX
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+8], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+16], hh, nb, ldq);
//#endif
// worked_on += ROW_LENGTH;
// }
//#endif
#if VEC_SET == SSE_128
#undef ROW_LENGTH
...
...
@@ -724,12 +1015,36 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 16
#endif
#endif
/* VEC_SET == AVX_512 */
//#if VEC_SET != AVX_512
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
);
worked_on
+=
ROW_LENGTH
;
}
//#else
// if (nq-i == ROW_LENGTH)
// {
//#ifdef DOUBLE_PRECISION_COMPLEX
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i], hh, nb, ldq);
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+4], hh, nb, ldq);
//#endif
//#ifdef SINGLE_PRECISION_COMPLEX
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i], hh, nb, ldq);
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+8], hh, nb, ldq);
//#endif
// worked_on += ROW_LENGTH;
// }
//#endif
#if VEC_SET == SSE_128
#undef ROW_LENGTH
...
...
@@ -750,11 +1065,35 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#define ROW_LENGTH 4
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#undef ROW_LENGTH
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 8
#endif
#endif
/* VEC_SET == AVX_512 */
//#if VEC_SET != AVX_512
if
(
nq
-
i
==
ROW_LENGTH
)
{
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
&
q
[
i
],
hh
,
nb
,
ldq
);
worked_on
+=
ROW_LENGTH
;
}
//#else
// if (nq-i == ROW_LENGTH)
// {
//#ifdef DOUBLE_PRECISION_COMPLEX
// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i], hh, nb, ldq);
//#endif
//#ifdef SINGLE_PRECISION_COMPLEX
// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i], hh, nb, ldq);
//#endif
// worked_on += ROW_LENGTH;
// }
//#endif
#endif
/* BLOCK1 */
...
...
@@ -908,6 +1247,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
#define ROW_LENGTH 24
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define ROW_LENGTH 48
#endif
#endif
/* VEC_SET == AVX_512 */
static
__forceinline
void
CONCAT_8ARGS
(
hh_trafo_complex_kernel_
,
ROW_LENGTH
,
_
,
SIMD_SET
,
_
,
BLOCK
,
hv_
,
WORD_LENGTH
)
(
DATA_TYPE_PTR
q
,
DATA_TYPE_PTR
hh
,
int
nb
,
int
ldq
#ifdef BLOCK1
)
...
...
@@ -951,6 +1298,15 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
#endif
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
#ifdef DOUBLE_PRECISION_COMPLEX
__SIMD_DATATYPE
sign
=
(
__SIMD_DATATYPE
)
_mm512_set_epi64
(
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
,
0x8000000000000000
);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
__SIMD_DATATYPE
sign
=
(
__SIMD_DATATYPE
)
_mm512_set1_epi32
(
0x80000000
);
#endif
#endif
/* VEC_SET == AVX_512 */
#ifdef BLOCK2
x1
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
ldq
)
+
0
]);
x2
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
ldq
)
+
offset
]);
...
...
@@ -1056,6 +1412,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h1_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[((
i
-
BLOCK
+
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h1_real
=
_SIMD_SET1
(
hh_dbl
[(
i
-
BLOCK
+
1
)
*
2
]);
h1_imag
=
_SIMD_SET1
(
hh_dbl
[((
i
-
BLOCK
+
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_512 */
#ifndef __ELPA_USE_FMA__
// conjugate
h1_imag
=
_SIMD_XOR
(
h1_imag
,
sign
);
...
...
@@ -1259,8 +1620,34 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h1_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h1_real
=
_SIMD_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_SIMD_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__SIMD_DATATYPE
)
_SIMD_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__SIMD_DATATYPE
)
_SIMD_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__SIMD_DATATYPE
)
_SIMD_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__SIMD_DATATYPE
)
_SIMD_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_SIMD_XOR
(
h1_real
,
sign
);
h1_imag
=
_SIMD_XOR
(
h1_imag
,
sign
);
#endif
#endif
#endif
/* VEC_SET == AVX_512 */
#if VEC_SET != AVX_512
h1_real
=
_SIMD_XOR
(
h1_real
,
sign
);
h1_imag
=
_SIMD_XOR
(
h1_imag
,
sign
);
#endif
/* VEC_SET != AVX_512 */
tmp1
=
_SIMD_MUL
(
h1_imag
,
x1
);
#ifdef __ELPA_USE_FMA__
...
...
@@ -1589,6 +1976,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM
h1_imag
=
_SIMD_BROADCAST
(
&
hh_dbl
[((
i
-
BLOCK
+
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_256 */
#if VEC_SET == AVX_512
h1_real
=
_SIMD_SET1
(
hh_dbl
[(
i
-
BLOCK
+
1
)
*
2
]);
h1_imag
=
_SIMD_SET1
(
hh_dbl
[((
i
-
BLOCK
+
1
)
*
2
)
+
1
]);
#endif
/* VEC_SET == AVX_512 */
q1
=
_SIMD_LOAD
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
0
]);