diff --git a/Doxyfile.in b/Doxyfile.in index c2c6f123c16ae2787589b19d5c46e66176fc2b7e..024d4a086254eac63e6038dc18ee2398b27aa463 100644 --- a/Doxyfile.in +++ b/Doxyfile.in @@ -935,7 +935,6 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \ @top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_template.c \ @top_srcdir@/src/elpa2/kernels/real_simple.F90 \ @top_srcdir@/src/elpa2/kernels/complex.F90 \ - @top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_template.c \ @top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c \ diff --git a/Makefile.am b/Makefile.am index 0463e74461885ec7c10969a933e1ce9acc3873e3..321f9064dc2ed4f8e768529b895ae6cfc349fa5a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -791,7 +791,6 @@ EXTRA_DIST = \ src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \ src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \ src/elpa2/elpa2_tridiag_band_template.F90 \ - src/elpa2/kernels/complex_avx512_1hv_template.c \ src/elpa2/kernels/complex_avx512_2hv_template.c \ src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \ src/elpa2/kernels/complex_template.F90 \ diff --git a/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c b/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c index aeddab6a2e2c7d2e71aae12399f2e7476ab1ba3c..f999f185d2d2916b2386a1f7056721d74175bbcd 100644 --- a/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c +++ b/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c @@ -68,9 +68,10 @@ //define instruction set numbers #define SSE_128 128 #define AVX_256 256 +#define AVX_512 512 #define NEON_ARCH64_128 1285 -#if VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == 512 +#if VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == AVX_512 #include #ifdef BLOCK2 #if VEC_SET == SSE_128 @@ -106,6 +107,11 @@ #define SIMD_SET AVX_AVX2 #endif +#if VEC_SET == AVX_512 +#define SIMD_SET AVX512 +#endif + + #if VEC_SET == SSE_128 #ifdef DOUBLE_PRECISION_COMPLEX @@ -225,6 +231,72 @@ #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + +#ifdef DOUBLE_PRECISION_COMPLEX +#define offset 8 +#define __SIMD_DATATYPE __m512d +#define _SIMD_LOAD _mm512_load_pd +#define _SIMD_LOADU 1 +#define _SIMD_STORE _mm512_store_pd +#define _SIMD_STOREU 1 +#define _SIMD_MUL _mm512_mul_pd +#define _SIMD_ADD _mm512_add_pd +#ifdef HAVE_AVX512_XEON +#define _SIMD_XOR _mm512_xor_pd +#endif +#define _SIMD_BROADCAST 1 +#define _SIMD_SET1 _mm512_set1_pd +#define _SIMD_XOR_EPI _mm512_xor_epi64 +#define _SIMD_ADDSUB 1 +#define _SIMD_SHUFFLE _mm512_shuffle_pd +#define _SHUFFLE 0x55 + +#ifdef HAVE_AVX512 +#define __ELPA_USE_FMA__ +#define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c) +#define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c) + +#define _SIMD_FMADDSUB _mm512_FMADDSUB_pd +#define _SIMD_FMSUBADD _mm512_FMSUBADD_pd +#endif /* HAVE_AVX512 */ + +#endif /* DOUBLE_PRECISION_COMPLEX */ + +#ifdef SINGLE_PRECISION_COMPLEX +#define offset 16 +#define __SIMD_DATATYPE __m512 +#define _SIMD_LOAD _mm512_load_ps +#define _SIMD_LOADU 1 +#define _SIMD_STORE _mm512_store_ps +#define _SIMD_STOREU 1 +#define _SIMD_MUL _mm512_mul_ps +#define _SIMD_ADD _mm512_add_ps +#ifdef HAVE_AVX512_XEON +#define _SIMD_XOR _mm512_xor_ps +#endif +#define _SIMD_BROADCAST 1 +#define _SIMD_SET1 _mm512_set1_ps +#define _SIMD_ADDSUB 1 +#define _SIMD_SHUFFLE _mm512_shuffle_ps +#define _SIMD_XOR_EPI _mm512_xor_epi32 +#define 
_SHUFFLE 0xb1 + +#ifdef HAVE_AVX512 + +#define __ELPA_USE_FMA__ +#define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c) +#define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c) + +#define _SIMD_FMADDSUB _mm512_FMADDSUB_ps +#define _SIMD_FMSUBADD _mm512_FMSUBADD_ps +#endif /* HAVE_AVX512 */ + +#endif /* SINGLE_PRECISION_COMPLEX */ + +#endif /* VEC_SET == AVX_512 */ + + #define __forceinline __attribute__((always_inline)) @@ -250,6 +322,8 @@ #endif +//Forward declaration + #if VEC_SET == SSE_128 #ifdef DOUBLE_PRECISION_COMPLEX #undef ROW_LENGTH @@ -272,7 +346,16 @@ #endif #endif /* VEC_SET == AVX_256 */ -//Forward declaration +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 24 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 48 +#endif +#endif /* VEC_SET == AVX_512 */ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq #ifdef BLOCK1 ); @@ -292,7 +375,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == SSE_128 */ - #if VEC_SET == AVX_256 #ifdef DOUBLE_PRECISION_COMPLEX #undef ROW_LENGTH @@ -304,6 +386,16 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 20 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 40 +#endif +#endif /* VEC_SET == AVX_512 */ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq #ifdef BLOCK1 @@ -325,7 +417,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == SSE_128 */ - #if VEC_SET == AVX_256 #ifdef DOUBLE_PRECISION_COMPLEX #undef ROW_LENGTH @@ -337,6 +428,16 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 16 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_512 */ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq #ifdef BLOCK1 @@ -357,7 +458,6 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == SSE_128 */ - #if VEC_SET == AVX_256 #ifdef DOUBLE_PRECISION_COMPLEX #undef ROW_LENGTH @@ -369,6 +469,16 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 24 +#endif +#endif /* VEC_SET == AVX_512 */ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq #ifdef BLOCK1 @@ -400,6 +510,16 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 8 +#endif 
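The ROW_LENGTH ladder being added here follows directly from the register width: a 512-bit register holds 8 doubles, i.e. 4 double-complex values (hence offset 8), or 16 floats, i.e. 8 float-complex values (offset 16), and the kernels are unrolled over one to six registers. A minimal sketch of that relation (editorial illustration only, not part of the patch; VEC_BITS and row_length are made-up names):

    #include <assert.h>

    enum { VEC_BITS = 512 };

    /* complex values handled per kernel call = registers used in the
       unrolled kernel times complex values per register */
    static int row_length(int n_regs, int bytes_per_real)
    {
        int complex_per_reg = (VEC_BITS / 8) / (2 * bytes_per_real);
        return n_regs * complex_per_reg;
    }

    static void check_row_lengths(void)
    {
        assert(row_length(6, sizeof(double)) == 24); /* widest double kernel    */
        assert(row_length(6, sizeof(float))  == 48); /* widest single kernel    */
        assert(row_length(1, sizeof(double)) ==  4); /* narrowest double kernel */
        assert(row_length(1, sizeof(float))  ==  8); /* narrowest single kernel */
    }

The same rule reproduces the intermediate values 20/40, 16/32, 12/24 and 8/16 declared in the surrounding hunks for five, four, three and two registers.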
+#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_512 */ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq #ifdef BLOCK1 @@ -431,6 +551,17 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_512 */ + static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq #ifdef BLOCK1 ); @@ -501,6 +632,37 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM !f>#endif */ +/* +!f>#if defined(HAVE_AVX512) +!f> interface +!f> subroutine single_hh_trafo_complex_AVX512_1hv_double(q, hh, pnb, pnq, pldq) & +!f> bind(C, name="single_hh_trafo_complex_AVX512_1hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq +!f> ! complex(kind=c_double_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_double_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#if defined(HAVE_AVX512) +!f> interface +!f> subroutine single_hh_trafo_complex_AVX512_1hv_single(q, hh, pnb, pnq, pldq) & +!f> bind(C, name="single_hh_trafo_complex_AVX512_1hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq +!f> ! complex(kind=c_float_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_float_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + + /* !f>#ifdef HAVE_SSE_INTRINSICS !f> interface @@ -615,12 +777,52 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 24 +#define STEP_SIZE 24 +#define UPPER_BOUND 20 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 48 +#define STEP_SIZE 48 +#define UPPER_BOUND 40 +#endif +#endif /* VEC_SET == AVX_512 */ + + +//#if VEC_SET != AVX_512 for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE) { CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); worked_on += ROW_LENGTH; } +//#else +// for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE) +// { +// +//#ifdef DOUBLE_PRECISION_COMPLEX +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+4], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+8], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+12], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+16], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+20], hh, nb, ldq); +//#endif +//#ifdef SINGLE_PRECISION_COMPLEX +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+8], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+16], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+24], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+32], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+40], hh, nb, 
ldq); +//#endif +// worked_on += ROW_LENGTH; +// } +//#endif + + if (nq == i) { return; } @@ -645,11 +847,42 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 20 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 40 +#endif +#endif /* VEC_SET == AVX_512 */ + +//#if VEC_SET != AVX_512 if (nq-i == ROW_LENGTH) { CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); worked_on += ROW_LENGTH; } +//#else +// if (nq-i == ROW_LENGTH) +// { +//#ifdef DOUBLE_PRECISION_COMPLEX +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+4], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+8], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+12], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+16], hh, nb, ldq); +//#endif +//#ifdef SINGLE_PRECISION_COMPLEX +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+8], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+16], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+24], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+32], hh, nb, ldq); +//#endif +// worked_on += ROW_LENGTH; +// } +//#endif #if VEC_SET == SSE_128 @@ -672,11 +905,42 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 16 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_512 */ + +//#if VEC_SET != AVX_512 if (nq-i == ROW_LENGTH) { CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); worked_on += ROW_LENGTH; } +//#else +// if (nq-i == ROW_LENGTH) +// { +//#ifdef DOUBLE_PRECISION_COMPLEX +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+4], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+8], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+12], hh, nb, ldq); +//#endif +//#ifdef SINGLE_PRECISION_COMPLEX +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+8], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+16], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+24], hh, nb, ldq); +//#endif +// +// worked_on += ROW_LENGTH; +// } +//#endif + #if VEC_SET == SSE_128 #undef ROW_LENGTH @@ -698,11 +962,38 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 24 +#endif +#endif /* VEC_SET == AVX_512 */ + +//#if VEC_SET != AVX_512 if (nq-i == ROW_LENGTH) { CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); worked_on += ROW_LENGTH; } +//#else +// if (nq-i == ROW_LENGTH) +// { +//#ifdef DOUBLE_PRECISION_COMPLEX +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i], hh, 
nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+4], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+8], hh, nb, ldq); +//#endif +//#ifdef SINGLE_PRECISION_COMPLEX +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+8], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+16], hh, nb, ldq); +//#endif +// worked_on += ROW_LENGTH; +// } +//#endif #if VEC_SET == SSE_128 #undef ROW_LENGTH @@ -724,12 +1015,36 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_512 */ +//#if VEC_SET != AVX_512 if (nq-i == ROW_LENGTH) { CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); worked_on += ROW_LENGTH; } +//#else +// if (nq-i == ROW_LENGTH) +// { +//#ifdef DOUBLE_PRECISION_COMPLEX +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i], hh, nb, ldq); +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i+4], hh, nb, ldq); +//#endif +//#ifdef SINGLE_PRECISION_COMPLEX +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i], hh, nb, ldq); +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i+8], hh, nb, ldq); +//#endif +// worked_on += ROW_LENGTH; +// } +//#endif #if VEC_SET == SSE_128 #undef ROW_LENGTH @@ -750,11 +1065,35 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D #define ROW_LENGTH 4 #endif #endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_512 */ + +//#if VEC_SET != AVX_512 if (nq-i == ROW_LENGTH) { CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); worked_on += ROW_LENGTH; } +//#else +// if (nq-i == ROW_LENGTH) +// { +//#ifdef DOUBLE_PRECISION_COMPLEX +// hh_trafo_complex_kernel_4_AVX512_1hv_double (&q[i], hh, nb, ldq); +//#endif +//#ifdef SINGLE_PRECISION_COMPLEX +// hh_trafo_complex_kernel_8_AVX512_1hv_single (&q[i], hh, nb, ldq); +//#endif +// worked_on += ROW_LENGTH; +// } +//#endif #endif /* BLOCK1 */ @@ -908,6 +1247,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (D #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 24 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 48 +#endif +#endif /* VEC_SET == AVX_512 */ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq #ifdef BLOCK1 ) @@ -951,6 +1298,15 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + #ifdef BLOCK2 x1 = 
_SIMD_LOAD(&q_dbl[(2*ldq)+0]); x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]); @@ -1056,6 +1412,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _SIMD_XOR(h1_imag, sign); @@ -1259,8 +1620,34 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[0]); + h1_imag = _SIMD_SET1(hh_dbl[1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON +#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif +#endif + +#endif /* VEC_SET == AVX_512 */ + + +#if VEC_SET != AVX_512 h1_real = _SIMD_XOR(h1_real, sign); h1_imag = _SIMD_XOR(h1_imag, sign); +#endif /* VEC_SET != AVX_512 */ tmp1 = _SIMD_MUL(h1_imag, x1); #ifdef __ELPA_USE_FMA__ @@ -1589,6 +1976,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]); @@ -1795,6 +2187,14 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 20 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 40 +#endif +#endif /* VEC_SET == AVX_512 */ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq #ifdef BLOCK1 ) @@ -1838,6 +2238,15 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + #ifdef BLOCK2 x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]); x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]); @@ -1934,6 +2343,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = 
_SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _SIMD_XOR(h1_imag, sign); @@ -2118,11 +2532,35 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[1]); #endif /* AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[0]); + h1_imag = _SIMD_SET1(hh_dbl[1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON +#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif +#endif + +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 h1_real = _SIMD_XOR(h1_real, sign); h1_imag = _SIMD_XOR(h1_imag, sign); +#endif /* VEC_SET != AVX_512 */ tmp1 = _SIMD_MUL(h1_imag, x1); - #ifdef __ELPA_USE_FMA__ x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else @@ -2418,6 +2856,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]); @@ -2601,6 +3044,14 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 16 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_512 */ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq #ifdef BLOCK1 ) @@ -2643,6 +3094,15 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + #ifdef BLOCK2 x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]); x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]); @@ -2730,6 +3190,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _SIMD_XOR(h1_imag, sign); @@ -2893,11 +3358,35 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = 
_SIMD_BROADCAST(&hh_dbl[1]); #endif /* AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[0]); + h1_imag = _SIMD_SET1(hh_dbl[1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON +#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif +#endif + +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 h1_real = _SIMD_XOR(h1_real, sign); h1_imag = _SIMD_XOR(h1_imag, sign); +#endif /* VEC_SET != AVX_512 */ tmp1 = _SIMD_MUL(h1_imag, x1); - #ifdef __ELPA_USE_FMA__ x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else @@ -3164,6 +3653,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]); @@ -3323,6 +3817,16 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #define ROW_LENGTH 12 #endif #endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 24 +#endif +#endif /* VEC_SET == AVX_512 */ + static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq #ifdef BLOCK1 ) @@ -3365,6 +3869,15 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + #ifdef BLOCK2 x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]); x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]); @@ -3442,6 +3955,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _SIMD_XOR(h1_imag, sign); @@ -3584,11 +4102,35 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[0]); + h1_imag = _SIMD_SET1(hh_dbl[1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) 
_SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON +#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif +#endif + +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 h1_real = _SIMD_XOR(h1_real, sign); h1_imag = _SIMD_XOR(h1_imag, sign); +#endif /* VEC_SET != AVX_512 */ tmp1 = _SIMD_MUL(h1_imag, x1); - #ifdef __ELPA_USE_FMA__ x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); #else @@ -3821,6 +4363,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]); @@ -3957,6 +4504,16 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #define ROW_LENGTH 8 #endif #endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_512 */ + static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq #ifdef BLOCK1 ) @@ -4000,6 +4557,15 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + #ifdef BLOCK2 x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]); x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]); @@ -4066,6 +4632,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _SIMD_XOR(h1_imag, sign); @@ -4185,8 +4756,33 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[0]); + h1_imag = _SIMD_SET1(hh_dbl[1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + 
h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON +#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif +#endif + +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 h1_real = _SIMD_XOR(h1_real, sign); h1_imag = _SIMD_XOR(h1_imag, sign); +#endif /* VEC_SET != AVX_512 */ tmp1 = _SIMD_MUL(h1_imag, x1); #ifdef __ELPA_USE_FMA__ @@ -4381,6 +4977,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); tmp1 = _SIMD_MUL(h1_imag, x1); @@ -4493,6 +5094,18 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #define ROW_LENGTH 4 #endif #endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_512 */ + +//#if VEC_SET != AVX_512 + static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq #ifdef BLOCK1 ) @@ -4536,6 +5149,15 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + #ifdef BLOCK2 x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]); @@ -4593,6 +5215,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _SIMD_XOR(h1_imag, sign); @@ -4691,8 +5318,33 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[0]); + h1_imag = _SIMD_SET1(hh_dbl[1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON +#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif +#endif + +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 h1_real = 
_SIMD_XOR(h1_real, sign); h1_imag = _SIMD_XOR(h1_imag, sign); +#endif /* VEC_SET != AVX_512 */ tmp1 = _SIMD_MUL(h1_imag, x1); #ifdef __ELPA_USE_FMA__ @@ -4854,6 +5506,11 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); #endif /* VEC_SET == AVX_256 */ +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); tmp1 = _SIMD_MUL(h1_imag, x1); @@ -4923,3 +5580,93 @@ static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIM #endif /* BLOCK2 */ } +//#endif + +#if 0 + +#ifdef DOUBLE_PRECISION_COMPLEX +static __forceinline void hh_trafo_complex_kernel_4_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq) +#endif +#ifdef SINGLE_PRECISION_COMPLEX +static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq) +#endif +{ + printf("calling it \n" ); + +#ifdef DOUBLE_PRECISION_COMPLEX + double* q_dbl = (double*)q; + double* hh_dbl = (double*)hh; +#endif +#ifdef SINGLE_PRECISION_COMPLEX + float* q_dbl = (float*)q; + float* hh_dbl = (float*)hh; +#endif + __SIMD_DATATYPE x1, x2; + __SIMD_DATATYPE q1, q2; + __SIMD_DATATYPE h1_real, h1_imag; + __SIMD_DATATYPE tmp1, tmp2; + int i=0; + +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif + + x1 = _SIMD_LOAD(&q_dbl[0]); + + for (i = 1; i < nb; i++) + { + h1_real = _SIMD_SET1(hh_dbl[i*2]); + h1_imag = _SIMD_SET1(hh_dbl[(i*2)+1]); + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + + tmp1 = _SIMD_MUL(h1_imag, q1); + x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + } + + h1_real = _SIMD_SET1(hh_dbl[0]); + h1_imag = _SIMD_SET1(hh_dbl[1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON +#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif +#endif + + tmp1 = _SIMD_MUL(h1_imag, x1); + x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); + + q1 = _SIMD_LOAD(&q_dbl[0]); + + q1 = _SIMD_ADD(q1, x1); + + _SIMD_STORE(&q_dbl[0], q1); + + for (i = 1; i < nb; i++) + { + h1_real = _SIMD_SET1(hh_dbl[i*2]); + h1_imag = _SIMD_SET1(hh_dbl[(i*2)+1]); + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + + tmp1 = _SIMD_MUL(h1_imag, x1); + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); + + _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1); + } +} +#endif diff --git a/src/elpa2/kernels/complex_avx512_1hv_double_precision.c b/src/elpa2/kernels/complex_avx512_1hv_double_precision.c index 
f64f0fec3e10ac31ff31c9bae8d9de25e5a22285..c87a59d8998fa10ccda853294193816356f46ef4 100644 --- a/src/elpa2/kernels/complex_avx512_1hv_double_precision.c +++ b/src/elpa2/kernels/complex_avx512_1hv_double_precision.c @@ -48,8 +48,12 @@ #define COMPLEXCASE 1 #define DOUBLE_PRECISION 1 +#define BLOCK1 1 +#define VEC_SET AVX_512 #include "../../general/precision_macros.h" -#include "complex_avx512_1hv_template.c" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef VEC_SET +#undef BLOCK1 #undef DOUBLE_PRECISION #undef COMPLEXCASE diff --git a/src/elpa2/kernels/complex_avx512_1hv_single_precision.c b/src/elpa2/kernels/complex_avx512_1hv_single_precision.c index fe71cc6bb6c3ddef6ec33e000a60c70e509bdfc7..cd0f7573f97d63149f7ac392e2e7b11ef338fa25 100644 --- a/src/elpa2/kernels/complex_avx512_1hv_single_precision.c +++ b/src/elpa2/kernels/complex_avx512_1hv_single_precision.c @@ -48,8 +48,12 @@ #define COMPLEXCASE 1 #define SINGLE_PRECISION 1 +#define BLOCK1 1 +#define VEC_SET AVX_512 #include "../../general/precision_macros.h" -#include "complex_avx512_1hv_template.c" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef VEC_SET +#undef BLOCK1 #undef SINGLE_PRECISION #undef COMPLEXCASE diff --git a/src/elpa2/kernels/complex_avx512_1hv_template.c b/src/elpa2/kernels/complex_avx512_1hv_template.c deleted file mode 100644 index 5d04b26cb5c4b8ef55264ec4b0ddafda48be1e1e..0000000000000000000000000000000000000000 --- a/src/elpa2/kernels/complex_avx512_1hv_template.c +++ /dev/null @@ -1,1111 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. 
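The two rewired wrapper files above are the whole instantiation mechanism of this patch: each thin translation unit selects precision, Householder block size and instruction set via preprocessor defines, then pulls in the shared template once per configuration. The template concatenates those choices (via CONCAT_8ARGS and friends) into the exported symbols, e.g. hh_trafo_complex_kernel_24_AVX512_1hv_double and the driver single_hh_trafo_complex_AVX512_1hv_double that the !f> interface blocks bind to from Fortran. Consolidated, the pattern each wrapper follows is this sketch (shown for single precision; only the precision define varies between the two files):

    #define COMPLEXCASE 1
    #define SINGLE_PRECISION 1
    #define BLOCK1 1            /* one Householder vector per call     */
    #define VEC_SET AVX_512     /* resolves to 512 inside the template */
    #include "../../general/precision_macros.h"
    #include "complex_128bit_256bit_512bit_BLOCK_template.c"
    #undef VEC_SET
    #undef BLOCK1
    #undef SINGLE_PRECISION
    #undef COMPLEXCASE

Note that AVX_512 is not yet defined when the wrapper sets VEC_SET; the template defines it to 512 before the first #if VEC_SET == AVX_512 is evaluated, at which point both sides of the comparison expand consistently.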
-// -// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- - - -#include "config-f90.h" - -#include -#include -#include -#include - -#define __forceinline __attribute__((always_inline)) - -#ifdef DOUBLE_PRECISION_COMPLEX -#define __AVX512_DATATYPE __m512d -#define _AVX512_LOAD _mm512_load_pd -#define _AVX512_STORE _mm512_store_pd -#define _AVX512_SET1 _mm512_set1_pd -#define _AVX512_MUL _mm512_mul_pd -#define _AVX512_ADD _mm512_add_pd -#define _AVX512_SHUFFLE _mm512_shuffle_pd -#ifdef HAVE_AVX512_XEON -#define _AVX512_XOR _mm512_xor_pd -#endif -#define _AVX512_XOR_EPI _mm512_xor_epi64 -#define _SHUFFLE 0x55 - -#ifdef HAVE_AVX512 - -#define __ELPA_USE_FMA__ -#define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c) -#define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c) - -#endif - -#define _AVX512_FMADDSUB _mm512_FMADDSUB_pd -#define _AVX512_FMSUBADD _mm512_FMSUBADD_pd -#endif /* DOUBLE_PRECISION_COMPLEX */ - -#ifdef SINGLE_PRECISION_COMPLEX -#define __AVX512_DATATYPE __m512 -#define _AVX512_LOAD _mm512_load_ps -#define _AVX512_STORE _mm512_store_ps -#define _AVX512_SET1 _mm512_set1_ps -#define _AVX512_MUL _mm512_mul_ps -#define _AVX512_ADD _mm512_add_ps -#define _AVX512_SHUFFLE _mm512_shuffle_ps -#ifdef HAVE_AVX512_XEON -#define _AVX512_XOR _mm512_xor_ps -#endif -#define _AVX512_XOR_EPI _mm512_xor_epi32 -#define _SHUFFLE 0xb1 - -#ifdef HAVE_AVX512 - -#define __ELPA_USE_FMA__ -#define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c) -#define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c) - -#endif - -#define _AVX512_FMADDSUB _mm512_FMADDSUB_ps -#define _AVX512_FMSUBADD _mm512_FMSUBADD_ps -#endif /* SINGLE_PRECISION_COMPLEX */ - - -//Forward declaration -#ifdef DOUBLE_PRECISION_COMPLEX -static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_20_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_12_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_4_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq); -#endif - -#ifdef SINGLE_PRECISION_COMPLEX -static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq); -#endif - - -/* -!f>#if defined(HAVE_AVX512) -!f> interface -!f> subroutine single_hh_trafo_complex_avx512_1hv_double(q, hh, pnb, pnq, pldq) & 
-!f> bind(C, name="single_hh_trafo_complex_avx512_1hv_double") -!f> use, intrinsic :: iso_c_binding -!f> integer(kind=c_int) :: pnb, pnq, pldq -!f> ! complex(kind=c_double_complex) :: q(*) -!f> type(c_ptr), value :: q -!f> complex(kind=c_double_complex) :: hh(pnb,2) -!f> end subroutine -!f> end interface -!f>#endif -*/ -/* -!f>#if defined(HAVE_AVX512) -!f> interface -!f> subroutine single_hh_trafo_complex_avx512_1hv_single(q, hh, pnb, pnq, pldq) & -!f> bind(C, name="single_hh_trafo_complex_avx512_1hv_single") -!f> use, intrinsic :: iso_c_binding -!f> integer(kind=c_int) :: pnb, pnq, pldq -!f> ! complex(kind=c_float_complex) :: q(*) -!f> type(c_ptr), value :: q -!f> complex(kind=c_float_complex) :: hh(pnb,2) -!f> end subroutine -!f> end interface -!f>#endif -*/ - -#ifdef DOUBLE_PRECISION_COMPLEX -void single_hh_trafo_complex_avx512_1hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq) -#endif -#ifdef SINGLE_PRECISION_COMPLEX -void single_hh_trafo_complex_avx512_1hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq) -#endif -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int worked_on; - //int ldh = *pldh; - - worked_on = 0; - -#ifdef DOUBLE_PRECISION_COMPLEX - for (i = 0; i < nq-20; i+=24) - { - hh_trafo_complex_kernel_24_AVX512_1hv_double(&q[i], hh, nb, ldq); - worked_on += 24; - } -#endif - -#ifdef SINGLE_PRECISION_COMPLEX - for (i = 0; i < nq-40; i+=48) - { - hh_trafo_complex_kernel_48_AVX512_1hv_single(&q[i], hh, nb, ldq); - worked_on += 48; - } -#endif - if (nq == i) - { - return; - } - -#ifdef DOUBLE_PRECISION_COMPLEX - if (nq-i == 20) - { - hh_trafo_complex_kernel_20_AVX512_1hv_double(&q[i], hh, nb, ldq); - worked_on += 20; - } -#endif - -#ifdef SINGLE_PRECISION_COMPLEX - if (nq-i == 40) - { - hh_trafo_complex_kernel_40_AVX512_1hv_single(&q[i], hh, nb, ldq); - worked_on += 40; - } -#endif - -#ifdef DOUBLE_PRECISION_COMPLEX - if (nq-i == 16) - { - hh_trafo_complex_kernel_16_AVX512_1hv_double(&q[i], hh, nb, ldq); - worked_on += 16; - } -#endif - -#ifdef SINGLE_PRECISION_COMPLEX - if (nq-i == 32) - { - hh_trafo_complex_kernel_32_AVX512_1hv_single(&q[i], hh, nb, ldq); - worked_on += 32; - } -#endif - -#ifdef DOUBLE_PRECISION_COMPLEX - if (nq-i == 12) - { - hh_trafo_complex_kernel_12_AVX512_1hv_double(&q[i], hh, nb, ldq); - worked_on += 12; - } -#endif - -#ifdef SINGLE_PRECISION_COMPLEX - if (nq-i == 24) - { - hh_trafo_complex_kernel_24_AVX512_1hv_single(&q[i], hh, nb, ldq); - worked_on += 24; - } -#endif - -#ifdef DOUBLE_PRECISION_COMPLEX - if (nq-i == 8) - { - hh_trafo_complex_kernel_8_AVX512_1hv_double(&q[i], hh, nb, ldq); - worked_on += 8; - } -#endif - -#ifdef SINGLE_PRECISION_COMPLEX - if (nq-i == 16) - { - hh_trafo_complex_kernel_16_AVX512_1hv_single(&q[i], hh, nb, ldq); - worked_on += 16; - } -#endif - -#ifdef DOUBLE_PRECISION_COMPLEX - if (nq-i == 4) - { - hh_trafo_complex_kernel_4_AVX512_1hv_double(&q[i], hh, nb, ldq); - worked_on += 4; - } -#endif - -#ifdef SINGLE_PRECISION_COMPLEX - if (nq-i == 8) - { - hh_trafo_complex_kernel_8_AVX512_1hv_single(&q[i], hh, nb, ldq); - worked_on += 8; - } -#endif -#ifdef WITH_DEBUG - if (worked_on != nq) - { - printf("Error in complex AVX512 BLOCK 1 kernel \n"); - abort(); - } -#endif -} - -#ifdef DOUBLE_PRECISION_COMPLEX -static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq) -#endif -#ifdef SINGLE_PRECISION_COMPLEX -static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float complex* 
q, float complex* hh, int nb, int ldq) -#endif -{ - -#ifdef DOUBLE_PRECISION_COMPLEX - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; -#endif -#ifdef SINGLE_PRECISION_COMPLEX - float* q_dbl = (float*)q; - float* hh_dbl = (float*)hh; -#endif - __AVX512_DATATYPE x1, x2, x3, x4, x5, x6; - __AVX512_DATATYPE q1, q2, q3, q4, q5, q6; - __AVX512_DATATYPE h1_real, h1_imag; - __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - int i=0; - -#ifdef DOUBLE_PRECISION_COMPLEX - __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); -#endif -#ifdef SINGLE_PRECISION_COMPLEX - __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000); -#endif - -#ifdef DOUBLE_PRECISION_COMPLEX -#define offset 8 -#endif -#ifdef SINGLE_PRECISION_COMPLEX -#define offset 16 -#endif - - - x1 = _AVX512_LOAD(&q_dbl[0]); // complex 1, 2, 3, 4 - x2 = _AVX512_LOAD(&q_dbl[offset]); // complex 5, 6, 7, 8 - x3 = _AVX512_LOAD(&q_dbl[2*offset]); // complex 9, 10, 11, 12 - x4 = _AVX512_LOAD(&q_dbl[3*offset]); // complex 13, 14, 15, 16 - x5 = _AVX512_LOAD(&q_dbl[4*offset]); // complex 17, 18, 19, 20 - x6 = _AVX512_LOAD(&q_dbl[5*offset]); // complex 21, 22, 23, 24 - - for (i = 1; i < nb; i++) - { - h1_real = _AVX512_SET1(hh_dbl[i*2]); - h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]); - - q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]); - q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]); - q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]); - q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]); - q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]); - q6 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+5*offset]); - - tmp1 = _AVX512_MUL(h1_imag, q1); - - x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE))); - - tmp2 = _AVX512_MUL(h1_imag, q2); - - x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE))); - - tmp3 = _AVX512_MUL(h1_imag, q3); - - x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE))); - - tmp4 = _AVX512_MUL(h1_imag, q4); - - x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE))); - - tmp5 = _AVX512_MUL(h1_imag, q5); - - x5 = _AVX512_ADD(x5, _AVX512_FMSUBADD(h1_real, q5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE))); - - tmp6 = _AVX512_MUL(h1_imag, q6); - - x6 = _AVX512_ADD(x6, _AVX512_FMSUBADD(h1_real, q6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE))); - } - - h1_real = _AVX512_SET1(hh_dbl[0]); - h1_imag = _AVX512_SET1(hh_dbl[1]); - -#ifdef HAVE_AVX512_XEON_PHI -#ifdef DOUBLE_PRECISION_COMPLEX - h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign); - h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign); -#endif -#ifdef SINGLE_PRECISION_COMPLEX - h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign); - h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign); -#endif -#endif -#ifdef HAVE_AVX512_XEON -#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) - h1_real = _AVX512_XOR(h1_real, sign); - h1_imag = _AVX512_XOR(h1_imag, sign); -#endif -#endif - - tmp1 = _AVX512_MUL(h1_imag, x1); - - x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)); - - tmp2 = _AVX512_MUL(h1_imag, x2); - - x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)); - - tmp3 = _AVX512_MUL(h1_imag, x3); - - x3 = 
_AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)); - - tmp4 = _AVX512_MUL(h1_imag, x4); - - x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)); - - tmp5 = _AVX512_MUL(h1_imag, x5); - - x5 = _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)); - - tmp6 = _AVX512_MUL(h1_imag, x6); - - x6 = _AVX512_FMADDSUB(h1_real, x6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE)); - - q1 = _AVX512_LOAD(&q_dbl[0]); - q2 = _AVX512_LOAD(&q_dbl[offset]); - q3 = _AVX512_LOAD(&q_dbl[2*offset]); - q4 = _AVX512_LOAD(&q_dbl[3*offset]); - q5 = _AVX512_LOAD(&q_dbl[4*offset]); - q6 = _AVX512_LOAD(&q_dbl[5*offset]); - - q1 = _AVX512_ADD(q1, x1); - q2 = _AVX512_ADD(q2, x2); - q3 = _AVX512_ADD(q3, x3); - q4 = _AVX512_ADD(q4, x4); - q5 = _AVX512_ADD(q5, x5); - q6 = _AVX512_ADD(q6, x6); - - _AVX512_STORE(&q_dbl[0], q1); - _AVX512_STORE(&q_dbl[offset], q2); - _AVX512_STORE(&q_dbl[2*offset], q3); - _AVX512_STORE(&q_dbl[3*offset], q4); - _AVX512_STORE(&q_dbl[4*offset], q5); - _AVX512_STORE(&q_dbl[5*offset], q6); - - for (i = 1; i < nb; i++) - { - h1_real = _AVX512_SET1(hh_dbl[i*2]); - h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]); - - q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]); - q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]); - q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]); - q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]); - q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]); - q6 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+5*offset]); - - tmp1 = _AVX512_MUL(h1_imag, x1); - - q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE))); - - tmp2 = _AVX512_MUL(h1_imag, x2); - - q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE))); - - tmp3 = _AVX512_MUL(h1_imag, x3); - - q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE))); - - tmp4 = _AVX512_MUL(h1_imag, x4); - - q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE))); - - tmp5 = _AVX512_MUL(h1_imag, x5); - - q5 = _AVX512_ADD(q5, _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE))); - - tmp6 = _AVX512_MUL(h1_imag, x6); - - q6 = _AVX512_ADD(q6, _AVX512_FMADDSUB(h1_real, x6, _AVX512_SHUFFLE(tmp6, tmp6, _SHUFFLE))); - - _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1); - _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2); - _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3); - _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4); - _AVX512_STORE(&q_dbl[(2*i*ldq)+4*offset], q5); - _AVX512_STORE(&q_dbl[(2*i*ldq)+5*offset], q6); - } -} - -#ifdef DOUBLE_PRECISION_COMPLEX -static __forceinline void hh_trafo_complex_kernel_20_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq) -#endif -#ifdef SINGLE_PRECISION_COMPLEX -static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq) -#endif -{ - -#ifdef DOUBLE_PRECISION_COMPLEX - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; -#endif -#ifdef SINGLE_PRECISION_COMPLEX - float* q_dbl = (float*)q; - float* hh_dbl = (float*)hh; -#endif - __AVX512_DATATYPE x1, x2, x3, x4, x5, x6; - __AVX512_DATATYPE q1, q2, q3, q4, q5, q6; - __AVX512_DATATYPE h1_real, h1_imag; - __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - int i=0; - -#ifdef DOUBLE_PRECISION_COMPLEX - __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); -#endif -#ifdef 
SINGLE_PRECISION_COMPLEX - __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000); -#endif - -#ifdef DOUBLE_PRECISION_COMPLEX -#define offset 8 -#endif -#ifdef SINGLE_PRECISION_COMPLEX -#define offset 16 -#endif - - - x1 = _AVX512_LOAD(&q_dbl[0]); // complex 1, 2, 3, 4 - x2 = _AVX512_LOAD(&q_dbl[offset]); // complex 5, 6, 7, 8 - x3 = _AVX512_LOAD(&q_dbl[2*offset]); // complex 9, 10, 11, 12 - x4 = _AVX512_LOAD(&q_dbl[3*offset]); // complex 13, 14, 15, 16 - x5 = _AVX512_LOAD(&q_dbl[4*offset]); // complex 17, 18, 19, 20 - - for (i = 1; i < nb; i++) - { - h1_real = _AVX512_SET1(hh_dbl[i*2]); - h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]); - - q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]); - q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]); - q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]); - q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]); - q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]); - - tmp1 = _AVX512_MUL(h1_imag, q1); - - x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE))); - - tmp2 = _AVX512_MUL(h1_imag, q2); - - x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE))); - - tmp3 = _AVX512_MUL(h1_imag, q3); - - x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE))); - - tmp4 = _AVX512_MUL(h1_imag, q4); - - x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE))); - - tmp5 = _AVX512_MUL(h1_imag, q5); - - x5 = _AVX512_ADD(x5, _AVX512_FMSUBADD(h1_real, q5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE))); - - } - - h1_real = _AVX512_SET1(hh_dbl[0]); - h1_imag = _AVX512_SET1(hh_dbl[1]); - -#ifdef HAVE_AVX512_XEON_PHI -#ifdef DOUBLE_PRECISION_COMPLEX - h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign); - h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign); -#endif -#ifdef SINGLE_PRECISION_COMPLEX - h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign); - h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign); -#endif -#endif -#ifdef HAVE_AVX512_XEON -#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) - h1_real = _AVX512_XOR(h1_real, sign); - h1_imag = _AVX512_XOR(h1_imag, sign); -#endif -#endif - - tmp1 = _AVX512_MUL(h1_imag, x1); - - x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)); - - tmp2 = _AVX512_MUL(h1_imag, x2); - - x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)); - - tmp3 = _AVX512_MUL(h1_imag, x3); - - x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)); - - tmp4 = _AVX512_MUL(h1_imag, x4); - - x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)); - - tmp5 = _AVX512_MUL(h1_imag, x5); - - x5 = _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)); - - q1 = _AVX512_LOAD(&q_dbl[0]); - q2 = _AVX512_LOAD(&q_dbl[offset]); - q3 = _AVX512_LOAD(&q_dbl[2*offset]); - q4 = _AVX512_LOAD(&q_dbl[3*offset]); - q5 = _AVX512_LOAD(&q_dbl[4*offset]); - - q1 = _AVX512_ADD(q1, x1); - q2 = _AVX512_ADD(q2, x2); - q3 = _AVX512_ADD(q3, x3); - q4 = _AVX512_ADD(q4, x4); - q5 = _AVX512_ADD(q5, x5); - - _AVX512_STORE(&q_dbl[0], q1); - _AVX512_STORE(&q_dbl[offset], q2); - _AVX512_STORE(&q_dbl[2*offset], q3); - _AVX512_STORE(&q_dbl[3*offset], q4); - _AVX512_STORE(&q_dbl[4*offset], q5); - - for (i = 1; i < nb; i++) - { - h1_real = _AVX512_SET1(hh_dbl[i*2]); - h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]); - - q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]); - 
-        q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
-        q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
-        q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
-        q5 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
-
-        tmp1 = _AVX512_MUL(h1_imag, x1);
-        q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
-        tmp2 = _AVX512_MUL(h1_imag, x2);
-        q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
-        tmp3 = _AVX512_MUL(h1_imag, x3);
-        q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
-        tmp4 = _AVX512_MUL(h1_imag, x4);
-        q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
-        tmp5 = _AVX512_MUL(h1_imag, x5);
-        q5 = _AVX512_ADD(q5, _AVX512_FMADDSUB(h1_real, x5, _AVX512_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
-
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
-    }
-}
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
-#endif
-{
-#ifdef DOUBLE_PRECISION_COMPLEX
-    double* q_dbl = (double*)q;
-    double* hh_dbl = (double*)hh;
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-    float* q_dbl = (float*)q;
-    float* hh_dbl = (float*)hh;
-#endif
-
-    __AVX512_DATATYPE x1, x2, x3, x4;
-    __AVX512_DATATYPE q1, q2, q3, q4;
-    __AVX512_DATATYPE h1_real, h1_imag;
-    __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
-    int i=0;
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-    __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-    __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
-#endif
-
-    x1 = _AVX512_LOAD(&q_dbl[0]);          // complex 1, 2, 3, 4
-    x2 = _AVX512_LOAD(&q_dbl[offset]);
-    x3 = _AVX512_LOAD(&q_dbl[2*offset]);
-    x4 = _AVX512_LOAD(&q_dbl[3*offset]);   // complex 13, 14, 15, 16
-
-    for (i = 1; i < nb; i++)
-    {
-        h1_real = _AVX512_SET1(hh_dbl[i*2]);
-        h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
-
-        q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
-        q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
-        q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
-        q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
-
-        tmp1 = _AVX512_MUL(h1_imag, q1);
-        x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
-        tmp2 = _AVX512_MUL(h1_imag, q2);
-        x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
-        tmp3 = _AVX512_MUL(h1_imag, q3);
-        x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
-        tmp4 = _AVX512_MUL(h1_imag, q4);
-        x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
-    }
-
-    h1_real = _AVX512_SET1(hh_dbl[0]);
-    h1_imag = _AVX512_SET1(hh_dbl[1]);
-
-#ifdef HAVE_AVX512_XEON_PHI
-#ifdef DOUBLE_PRECISION_COMPLEX
-    h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
-    h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-    h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
-    h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
-#endif
-#endif
-#ifdef HAVE_AVX512_XEON
-#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
-    h1_real = _AVX512_XOR(h1_real, sign);
-    h1_imag = _AVX512_XOR(h1_imag, sign);
-#endif
-#endif
-
-    tmp1 = _AVX512_MUL(h1_imag, x1);
-    x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
-    tmp2 = _AVX512_MUL(h1_imag, x2);
-    x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
-    tmp3 = _AVX512_MUL(h1_imag, x3);
-    x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
-    tmp4 = _AVX512_MUL(h1_imag, x4);
-    x4 = _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE));
-
-    q1 = _AVX512_LOAD(&q_dbl[0]);
-    q2 = _AVX512_LOAD(&q_dbl[offset]);
-    q3 = _AVX512_LOAD(&q_dbl[2*offset]);
-    q4 = _AVX512_LOAD(&q_dbl[3*offset]);
-
-    q1 = _AVX512_ADD(q1, x1);
-    q2 = _AVX512_ADD(q2, x2);
-    q3 = _AVX512_ADD(q3, x3);
-    q4 = _AVX512_ADD(q4, x4);
-
-    _AVX512_STORE(&q_dbl[0], q1);
-    _AVX512_STORE(&q_dbl[offset], q2);
-    _AVX512_STORE(&q_dbl[2*offset], q3);
-    _AVX512_STORE(&q_dbl[3*offset], q4);
-
-    for (i = 1; i < nb; i++)
-    {
-        h1_real = _AVX512_SET1(hh_dbl[i*2]);
-        h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
-
-        q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
-        q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
-        q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
-        q4 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
-
-        tmp1 = _AVX512_MUL(h1_imag, x1);
-        q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
-        tmp2 = _AVX512_MUL(h1_imag, x2);
-        q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
-        tmp3 = _AVX512_MUL(h1_imag, x3);
-        q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
-        tmp4 = _AVX512_MUL(h1_imag, x4);
-        q4 = _AVX512_ADD(q4, _AVX512_FMADDSUB(h1_real, x4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
-
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
-    }
-}
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-static __forceinline void hh_trafo_complex_kernel_12_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
-#endif
-{
-#ifdef DOUBLE_PRECISION_COMPLEX
-    double* q_dbl = (double*)q;
-    double* hh_dbl = (double*)hh;
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-    float* q_dbl = (float*)q;
-    float* hh_dbl = (float*)hh;
-#endif
-
-    __AVX512_DATATYPE x1, x2, x3, x4;
-    __AVX512_DATATYPE q1, q2, q3, q4;
-    __AVX512_DATATYPE h1_real, h1_imag;
-    __AVX512_DATATYPE tmp1, tmp2, tmp3, tmp4;
-    int i=0;
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-    __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-    __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
-#endif
-
-    x1 = _AVX512_LOAD(&q_dbl[0]);          // complex 1, 2, 3, 4
-    x2 = _AVX512_LOAD(&q_dbl[offset]);
-    x3 = _AVX512_LOAD(&q_dbl[2*offset]);
-
-    for (i = 1; i < nb; i++)
-    {
-        h1_real = _AVX512_SET1(hh_dbl[i*2]);
-        h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
-
-        q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
-        q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
-        q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
-
-        tmp1 = _AVX512_MUL(h1_imag, q1);
-        x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
-        tmp2 = _AVX512_MUL(h1_imag, q2);
-        x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
-        tmp3 = _AVX512_MUL(h1_imag, q3);
-        x3 = _AVX512_ADD(x3, _AVX512_FMSUBADD(h1_real, q3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
-    }
-
-    h1_real = _AVX512_SET1(hh_dbl[0]);
-    h1_imag = _AVX512_SET1(hh_dbl[1]);
-
-#ifdef HAVE_AVX512_XEON_PHI
-#ifdef DOUBLE_PRECISION_COMPLEX
-    h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
-    h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-    h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
-    h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
-#endif
-#endif
-#ifdef HAVE_AVX512_XEON
-#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
-    h1_real = _AVX512_XOR(h1_real, sign);
-    h1_imag = _AVX512_XOR(h1_imag, sign);
-#endif
-#endif
-
-    tmp1 = _AVX512_MUL(h1_imag, x1);
-    x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
-    tmp2 = _AVX512_MUL(h1_imag, x2);
-    x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
-    tmp3 = _AVX512_MUL(h1_imag, x3);
-    x3 = _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE));
-
-    q1 = _AVX512_LOAD(&q_dbl[0]);
-    q2 = _AVX512_LOAD(&q_dbl[offset]);
-    q3 = _AVX512_LOAD(&q_dbl[2*offset]);
-
-    q1 = _AVX512_ADD(q1, x1);
-    q2 = _AVX512_ADD(q2, x2);
-    q3 = _AVX512_ADD(q3, x3);
-
-    _AVX512_STORE(&q_dbl[0], q1);
-    _AVX512_STORE(&q_dbl[offset], q2);
-    _AVX512_STORE(&q_dbl[2*offset], q3);
-
-    for (i = 1; i < nb; i++)
-    {
-        h1_real = _AVX512_SET1(hh_dbl[i*2]);
-        h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
-
-        q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
-        q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
-        q3 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
-
-        tmp1 = _AVX512_MUL(h1_imag, x1);
-        q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
-        tmp2 = _AVX512_MUL(h1_imag, x2);
-        q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
-        tmp3 = _AVX512_MUL(h1_imag, x3);
-        q3 = _AVX512_ADD(q3, _AVX512_FMADDSUB(h1_real, x3, _AVX512_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
-
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
-    }
-}
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
-#endif
-{
-#ifdef DOUBLE_PRECISION_COMPLEX
-    double* q_dbl = (double*)q;
-    double* hh_dbl = (double*)hh;
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-    float* q_dbl = (float*)q;
-    float* hh_dbl = (float*)hh;
-#endif
-
-    __AVX512_DATATYPE x1, x2;
-    __AVX512_DATATYPE q1, q2;
-    __AVX512_DATATYPE h1_real, h1_imag;
-    __AVX512_DATATYPE tmp1, tmp2;
-    int i=0;
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-    __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-    __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
-#endif
-
-    x1 = _AVX512_LOAD(&q_dbl[0]);
-    x2 = _AVX512_LOAD(&q_dbl[offset]);
-
-    for (i = 1; i < nb; i++)
-    {
-        h1_real = _AVX512_SET1(hh_dbl[i*2]);
-        h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
-
-        q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
-        q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
-
-        tmp1 = _AVX512_MUL(h1_imag, q1);
-        x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
-        tmp2 = _AVX512_MUL(h1_imag, q2);
-        x2 = _AVX512_ADD(x2, _AVX512_FMSUBADD(h1_real, q2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
-    }
-
-    h1_real = _AVX512_SET1(hh_dbl[0]);
-    h1_imag = _AVX512_SET1(hh_dbl[1]);
-
-#ifdef HAVE_AVX512_XEON_PHI
-#ifdef DOUBLE_PRECISION_COMPLEX
-    h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
-    h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-    h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
-    h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
-#endif
-#endif
-#ifdef HAVE_AVX512_XEON
-#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
-    h1_real = _AVX512_XOR(h1_real, sign);
-    h1_imag = _AVX512_XOR(h1_imag, sign);
-#endif
-#endif
-
-    tmp1 = _AVX512_MUL(h1_imag, x1);
-    x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
-    tmp2 = _AVX512_MUL(h1_imag, x2);
-    x2 = _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE));
-
-    q1 = _AVX512_LOAD(&q_dbl[0]);
-    q2 = _AVX512_LOAD(&q_dbl[offset]);
-
-    q1 = _AVX512_ADD(q1, x1);
-    q2 = _AVX512_ADD(q2, x2);
-
-    _AVX512_STORE(&q_dbl[0], q1);
-    _AVX512_STORE(&q_dbl[offset], q2);
-
-    for (i = 1; i < nb; i++)
-    {
-        h1_real = _AVX512_SET1(hh_dbl[i*2]);
-        h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
-
-        q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
-        q2 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+offset]);
-
-        tmp1 = _AVX512_MUL(h1_imag, x1);
-        q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
-        tmp2 = _AVX512_MUL(h1_imag, x2);
-        q2 = _AVX512_ADD(q2, _AVX512_FMADDSUB(h1_real, x2, _AVX512_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
-
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+offset], q2);
-    }
-}
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-static __forceinline void hh_trafo_complex_kernel_4_AVX512_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
-#endif
-{
-#ifdef DOUBLE_PRECISION_COMPLEX
-    double* q_dbl = (double*)q;
-    double* hh_dbl = (double*)hh;
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-    float* q_dbl = (float*)q;
-    float* hh_dbl = (float*)hh;
-#endif
-
-    __AVX512_DATATYPE x1, x2;
-    __AVX512_DATATYPE q1, q2;
-    __AVX512_DATATYPE h1_real, h1_imag;
-    __AVX512_DATATYPE tmp1, tmp2;
-    int i=0;
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-    __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set_epi64(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-    __AVX512_DATATYPE sign = (__AVX512_DATATYPE)_mm512_set1_epi32(0x80000000);
-#endif
-
-    x1 = _AVX512_LOAD(&q_dbl[0]);
-
-    for (i = 1; i < nb; i++)
-    {
-        h1_real = _AVX512_SET1(hh_dbl[i*2]);
-        h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
-
-        q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
-
-        tmp1 = _AVX512_MUL(h1_imag, q1);
-        x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
-    }
-
-    h1_real = _AVX512_SET1(hh_dbl[0]);
-    h1_imag = _AVX512_SET1(hh_dbl[1]);
-
-#ifdef HAVE_AVX512_XEON_PHI
-#ifdef DOUBLE_PRECISION_COMPLEX
-    h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
-    h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
-#endif
-#ifdef SINGLE_PRECISION_COMPLEX
-    h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
-    h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
-#endif
-#endif
-#ifdef HAVE_AVX512_XEON
-#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
-    h1_real = _AVX512_XOR(h1_real, sign);
-    h1_imag = _AVX512_XOR(h1_imag, sign);
-#endif
-#endif
-
-    tmp1 = _AVX512_MUL(h1_imag, x1);
-    x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
-
-    q1 = _AVX512_LOAD(&q_dbl[0]);
-    q1 = _AVX512_ADD(q1, x1);
-    _AVX512_STORE(&q_dbl[0], q1);
-
-    for (i = 1; i < nb; i++)
-    {
-        h1_real = _AVX512_SET1(hh_dbl[i*2]);
-        h1_imag = _AVX512_SET1(hh_dbl[(i*2)+1]);
-
-        q1 = _AVX512_LOAD(&q_dbl[(2*i*ldq)+0]);
-
-        tmp1 = _AVX512_MUL(h1_imag, x1);
-        q1 = _AVX512_ADD(q1, _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
-
-        _AVX512_STORE(&q_dbl[(2*i*ldq)+0], q1);
-    }
-}
-
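
Note for reviewers: every kernel deleted above uses the same complex-arithmetic idiom for interleaved (re, im) storage: multiply by the broadcast imaginary part, swap the real/imaginary slots of each pair with _AVX512_SHUFFLE, then combine with _AVX512_FMSUBADD (accumulation phase, yielding conj(h)*q) or _AVX512_FMADDSUB (update phase, yielding h*x after h has been sign-flipped via the XOR with `sign`). The following standalone scalar sketch is illustration only, not part of either source file; all names in it are hypothetical. It assumes the standard Intel semantics of fmaddsub (even lanes compute a*b - c, odd lanes a*b + c).

    /* Scalar model of one _AVX512_FMADDSUB step: y = h * x for a single
     * complex lane stored interleaved as (re, im), as in q_dbl. */
    #include <stdio.h>

    int main(void)
    {
        double h_re = 3.0, h_im = 0.5;   /* h1_real / h1_imag broadcast values */
        double x_re = 1.0, x_im = 2.0;   /* one complex lane of x              */

        /* tmp = h_im * x, elementwise (the _AVX512_MUL step) */
        double tmp_re = h_im * x_re;
        double tmp_im = h_im * x_im;

        /* _AVX512_SHUFFLE with _SHUFFLE swaps re/im within each pair */
        double swp_re = tmp_im;
        double swp_im = tmp_re;

        /* fmaddsub: even lane = a*b - c, odd lane = a*b + c */
        double y_re = h_re * x_re - swp_re;   /* h_re*x_re - h_im*x_im */
        double y_im = h_re * x_im + swp_im;   /* h_re*x_im + h_im*x_re */

        printf("h*x = (%g, %g)\n", y_re, y_im);   /* prints (2, 6.5) */
        return 0;
    }

Flipping the add/sub pattern (FMSUBADD: even lanes add, odd lanes subtract) produces conj(h)*x in the same layout, which is exactly what the first loop of each kernel accumulates into x1..x5.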