Commit ba9967d6 authored by Andreas Marek's avatar Andreas Marek

Also put 256bit 4hv kernel in unfied file

parent 135e3732
...@@ -939,7 +939,6 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \ ...@@ -939,7 +939,6 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_template.c \ @top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_bgp.f90 \ @top_srcdir@/src/elpa2/kernels/real_bgp.f90 \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_sse_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx512_6hv_double_precision.c \
......
...@@ -775,7 +775,6 @@ EXTRA_DIST = \ ...@@ -775,7 +775,6 @@ EXTRA_DIST = \
src/elpa2/kernels/complex_sse_1hv_template.c \ src/elpa2/kernels/complex_sse_1hv_template.c \
src/elpa2/kernels/complex_sse_2hv_template.c \ src/elpa2/kernels/complex_sse_2hv_template.c \
src/elpa2/kernels/complex_template.F90 \ src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/real_avx-avx2_4hv_template.c \
src/elpa2/kernels/real_avx-avx2_6hv_template.c \ src/elpa2/kernels/real_avx-avx2_6hv_template.c \
src/elpa2/kernels/real_avx512_2hv_template.c \ src/elpa2/kernels/real_avx512_2hv_template.c \
src/elpa2/kernels/real_avx512_4hv_template.c \ src/elpa2/kernels/real_avx512_4hv_template.c \
......
...@@ -125,6 +125,7 @@ ...@@ -125,6 +125,7 @@
#define _SIMD_XOR _mm_xor_ps #define _SIMD_XOR _mm_xor_ps
#endif #endif
#endif /* VEC_SET == 128 || VEC_SET == 1281 */ #endif /* VEC_SET == 128 || VEC_SET == 1281 */
#if VEC_SET == 256 #if VEC_SET == 256
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
#define offset 4 #define offset 4
...@@ -133,19 +134,27 @@ ...@@ -133,19 +134,27 @@
#define _SIMD_STORE _mm256_store_pd #define _SIMD_STORE _mm256_store_pd
#define _SIMD_ADD _mm256_add_pd #define _SIMD_ADD _mm256_add_pd
#define _SIMD_MUL _mm256_mul_pd #define _SIMD_MUL _mm256_mul_pd
//#define _SIMD_SUB _mm256_dub_pd #define _SIMD_SUB _mm256_sub_pd
#define _SIMD_XOR _mm256_xor_pd #define _SIMD_XOR _mm256_xor_pd
#define _SIMD_BROADCAST _mm256_broadcast_sd #define _SIMD_BROADCAST _mm256_broadcast_sd
#ifdef HAVE_AVX2 #ifdef HAVE_AVX2
#ifdef __FMA4__ #ifdef __FMA4__
#define __ELPA_USE_FMA__ #define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
#define _SIMD_FMA _mm256_FMA_pd #define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c)
#endif #error "This should be prop _mm256_msub_pd instead of _mm256_msub"
#define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c)
#endif /* __FMA4__ */
#ifdef __AVX2__ #ifdef __AVX2__
#define __ELPA_USE_FMA__ #define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) #define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
#define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c)
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#endif /* __AVX2__ */
#ifdef __ELPA_USE_FMA__
#define _SIMD_FMA _mm256_FMA_pd #define _SIMD_FMA _mm256_FMA_pd
#define _SIMD_NFMA _mm256_NFMA_pd
#define _SIMD_FMSUB _mm256_FMSUB_pd
#endif #endif
#endif /* HAVE_AVX2 */ #endif /* HAVE_AVX2 */
#endif /* DOUBLE_PRECISION_REAL */ #endif /* DOUBLE_PRECISION_REAL */
...@@ -157,19 +166,27 @@ ...@@ -157,19 +166,27 @@
#define _SIMD_STORE _mm256_store_ps #define _SIMD_STORE _mm256_store_ps
#define _SIMD_ADD _mm256_add_ps #define _SIMD_ADD _mm256_add_ps
#define _SIMD_MUL _mm256_mul_ps #define _SIMD_MUL _mm256_mul_ps
//#define _SIMD_SUB _mm256_sub_ps #define _SIMD_SUB _mm256_sub_ps
#define _SIMD_XOR _mm256_xor_ps #define _SIMD_XOR _mm256_xor_ps
#define _SIMD_BROADCAST _mm256_broadcast_ss #define _SIMD_BROADCAST _mm256_broadcast_ss
#ifdef HAVE_AVX2 #ifdef HAVE_AVX2
#ifdef __FMA4__ #ifdef __FMA4__
#define __ELPA_USE_FMA__ #define __ELPA_USE_FMA__
#define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c) #define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c)
#define _SIMD_FMA _mm256_FMA_ps #define _mm256_NFMA_ps(a,b,c) _mm256_nmacc_ps(a,b,c)
#endif #error "This should be prop _mm256_msub_ps instead of _mm256_msub"
#define _mm256_FMSUB_ps(a,b,c) _mm256_msub(a,b,c)
#endif /* __FMA4__ */
#ifdef __AVX2__ #ifdef __AVX2__
#define __ELPA_USE_FMA__ #define __ELPA_USE_FMA__
#define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c) #define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c)
#define _mm256_NFMA_ps(a,b,c) _mm256_fnmadd_ps(a,b,c)
#define _mm256_FMSUB_ps(a,b,c) _mm256_fmsub_ps(a,b,c)
#endif /* __AVX2__ */
#ifdef __ELPA_USE_FMA__
#define _SIMD_FMA _mm256_FMA_ps #define _SIMD_FMA _mm256_FMA_ps
#define _SIMD_NFMA _mm256_NFMA_ps
#define _SIMD_FMSUB _mm256_FMSUB_ps
#endif #endif
#endif /* HAVE_AVX2 */ #endif /* HAVE_AVX2 */
#endif /* SINGLE_PRECISION_REAL */ #endif /* SINGLE_PRECISION_REAL */
...@@ -545,6 +562,34 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA ...@@ -545,6 +562,34 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
!f> end interface !f> end interface
!f>#endif !f>#endif
*/ */
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine quad_hh_trafo_real_AVX_AVX2_4hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="quad_hh_trafo_real_AVX_AVX2_4hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine quad_hh_trafo_real_AVX_AVX2_4hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="quad_hh_trafo_real_AVX_AVX2_4hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/* /*
!f>#ifdef HAVE_SSE_INTRINSICS !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
...@@ -719,7 +764,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA ...@@ -719,7 +764,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif /* BLOCK6 */ #endif /* BLOCK6 */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 256
#pragma ivdep #pragma ivdep
#endif #endif
for (i = BLOCK; i < nb; i++) for (i = BLOCK; i < nb; i++)
...@@ -762,6 +807,8 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA ...@@ -762,6 +807,8 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
} }
// Production level kernel calls with padding // Production level kernel calls with padding
#ifdef BLOCK2
#if VEC_SET == 128 || VEC_SET == 1281 #if VEC_SET == 128 || VEC_SET == 1281
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
#define STEP_SIZE 12 #define STEP_SIZE 12
...@@ -786,9 +833,8 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA ...@@ -786,9 +833,8 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#define ROW_LENGTH 48 #define ROW_LENGTH 48
#define UPPER_BOUND 40 #define UPPER_BOUND 40
#endif #endif
#endif /* AVX_AVX2 */ #endif /* VEC_SET == 256 */
#ifdef BLOCK2
for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE ) for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE )
{ {
CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_2hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_2hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
...@@ -930,58 +976,96 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA ...@@ -930,58 +976,96 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif /* BLOCK2 */ #endif /* BLOCK2 */
#ifdef BLOCK4 #ifdef BLOCK4
#undef ROW_LENGTH
#if VEC_SET == 128 || VEC_SET == 1281
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
for (i = 0; i < nq-4; i+=6) #define ROW_LENGTH 6
{ #define STEP_SIZE 6
CONCAT_4ARGS(hh_trafo_kernel_6_,SIMD_SET,_4hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); #define UPPER_BOUND 4
worked_on += 6; #endif
} #ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 12
#define STEP_SIZE 12
#define UPPER_BOUND 8
#endif #endif
#endif /* VEC_SET == 128 || VEC_SET == 1281 */
#if VEC_SET == 256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 12
#define STEP_SIZE 12
#define UPPER_BOUND 8
#endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-8; i+=12) #define ROW_LENGTH 24
#define STEP_SIZE 24
#define UPPER_BOUND 16
#endif
#endif /* VEC_SET == 256 */
for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE )
{ {
CONCAT_4ARGS(hh_trafo_kernel_12_,SIMD_SET,_4hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_4hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 12; worked_on += ROW_LENGTH;
} }
#endif
if (nq == i) if (nq == i)
{ {
return; return;
} }
#undef ROW_LENGTH
#if VEC_SET == 128 || VEC_SET == 1281
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
if (nq-i ==4) #define ROW_LENGTH 4
{ #endif
CONCAT_4ARGS(hh_trafo_kernel_4_,SIMD_SET,_4hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); #ifdef SINGLE_PRECISION_REAL
worked_on += 4; #define ROW_LENGTH 8
}
#endif #endif
#endif /* VEC_SET == 128 || VEC_SET == 1281 */
#if VEC_SET == 256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 8
#endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i ==8) #define ROW_LENGTH 16
#endif
#endif /* VEC_SET == 256 */
if (nq-i == ROW_LENGTH )
{ {
CONCAT_4ARGS(hh_trafo_kernel_8_,SIMD_SET,_4hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_4hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 8; worked_on += ROW_LENGTH;
} }
#endif
#undef ROW_LENGTH
#if VEC_SET == 128 || VEC_SET == 1281
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
if (nq-i == 2) #define ROW_LENGTH 2
{
CONCAT_4ARGS(hh_trafo_kernel_2_,SIMD_SET,_4hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 2;
}
#endif #endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 4
#endif
#endif /* VEC_SET == 128 || VEC_SET == 1281 */
#if VEC_SET == 256
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
if (nq-i == 4) #define ROW_LENGTH 8
{
CONCAT_4ARGS(hh_trafo_kernel_4_,SIMD_SET,_4hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += 4;
}
#endif #endif
#endif /* VEC_SET == 256 */
if (nq-i == ROW_LENGTH )
{
CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_4hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
worked_on += ROW_LENGTH;
}
#endif /* BLOCK4 */ #endif /* BLOCK4 */
...@@ -1186,6 +1270,24 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1186,6 +1270,24 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
__SIMD_DATATYPE h_4_1 = _SSE_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); __SIMD_DATATYPE h_4_1 = _SSE_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]);
#endif #endif
#if VEC_SET == 256
__SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]);
__SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]);
__SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]);
__SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]);
__SIMD_DATATYPE h_4_2 = _SIMD_BROADCAST(&hh[(ldh*3)+2]);
__SIMD_DATATYPE h_4_1 = _SIMD_BROADCAST(&hh[(ldh*3)+3]);
#endif
#ifdef __ELPA_USE_FMA__
register __SIMD_DATATYPE w1 = _SIMD_FMA(a3_1, h_4_3, a4_1);
w1 = _SIMD_FMA(a2_1, h_4_2, w1);
w1 = _SIMD_FMA(a1_1, h_4_1, w1);
register __SIMD_DATATYPE z1 = _SIMD_FMA(a2_1, h_3_2, a3_1);
z1 = _SIMD_FMA(a1_1, h_3_1, z1);
register __SIMD_DATATYPE y1 = _SIMD_FMA(a1_1, h_2_1, a2_1);
register __SIMD_DATATYPE x1 = a1_1;
#else
register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3)); register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3));
w1 = _SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2)); w1 = _SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2));
w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1));
...@@ -1193,12 +1295,22 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1193,12 +1295,22 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1)); z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1));
register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1)); register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1));
register __SIMD_DATATYPE x1 = a1_1; register __SIMD_DATATYPE x1 = a1_1;
#endif /* __ELPA_USE_FMA__ */
__SIMD_DATATYPE a1_2 = _SIMD_LOAD(&q[(ldq*3)+offset]); __SIMD_DATATYPE a1_2 = _SIMD_LOAD(&q[(ldq*3)+offset]);
__SIMD_DATATYPE a2_2 = _SIMD_LOAD(&q[(ldq*2)+offset]); __SIMD_DATATYPE a2_2 = _SIMD_LOAD(&q[(ldq*2)+offset]);
__SIMD_DATATYPE a3_2 = _SIMD_LOAD(&q[ldq+offset]); __SIMD_DATATYPE a3_2 = _SIMD_LOAD(&q[ldq+offset]);
__SIMD_DATATYPE a4_2 = _SIMD_LOAD(&q[0+offset]); __SIMD_DATATYPE a4_2 = _SIMD_LOAD(&q[0+offset]);
#ifdef __ELPA_USE_FMA__
register __SIMD_DATATYPE w2 = _SIMD_FMA(a3_2, h_4_3, a4_2);
w2 = _SIMD_FMA(a2_2, h_4_2, w2);
w2 = _SIMD_FMA(a1_2, h_4_1, w2);
register __SIMD_DATATYPE z2 = _SIMD_FMA(a2_2, h_3_2, a3_2);
z2 = _SIMD_FMA(a1_2, h_3_1, z2);
register __SIMD_DATATYPE y2 = _SIMD_FMA(a1_2, h_2_1, a2_2);
register __SIMD_DATATYPE x2 = a1_2;
#else
register __SIMD_DATATYPE w2 = _SIMD_ADD(a4_2, _SIMD_MUL(a3_2, h_4_3)); register __SIMD_DATATYPE w2 = _SIMD_ADD(a4_2, _SIMD_MUL(a3_2, h_4_3));
w2 = _SIMD_ADD(w2, _SIMD_MUL(a2_2, h_4_2)); w2 = _SIMD_ADD(w2, _SIMD_MUL(a2_2, h_4_2));
w2 = _SIMD_ADD(w2, _SIMD_MUL(a1_2, h_4_1)); w2 = _SIMD_ADD(w2, _SIMD_MUL(a1_2, h_4_1));
...@@ -1206,12 +1318,22 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1206,12 +1318,22 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
z2 = _SIMD_ADD(z2, _SIMD_MUL(a1_2, h_3_1)); z2 = _SIMD_ADD(z2, _SIMD_MUL(a1_2, h_3_1));
register __SIMD_DATATYPE y2 = _SIMD_ADD(a2_2, _SIMD_MUL(a1_2, h_2_1)); register __SIMD_DATATYPE y2 = _SIMD_ADD(a2_2, _SIMD_MUL(a1_2, h_2_1));
register __SIMD_DATATYPE x2 = a1_2; register __SIMD_DATATYPE x2 = a1_2;
#endif /* __ELPA_USE_FMA__ */
__SIMD_DATATYPE a1_3 = _SIMD_LOAD(&q[(ldq*3)+2*offset]); __SIMD_DATATYPE a1_3 = _SIMD_LOAD(&q[(ldq*3)+2*offset]);
__SIMD_DATATYPE a2_3 = _SIMD_LOAD(&q[(ldq*2)+2*offset]); __SIMD_DATATYPE a2_3 = _SIMD_LOAD(&q[(ldq*2)+2*offset]);
__SIMD_DATATYPE a3_3 = _SIMD_LOAD(&q[ldq+2*offset]); __SIMD_DATATYPE a3_3 = _SIMD_LOAD(&q[ldq+2*offset]);
__SIMD_DATATYPE a4_3 = _SIMD_LOAD(&q[0+2*offset]); __SIMD_DATATYPE a4_3 = _SIMD_LOAD(&q[0+2*offset]);
#ifdef __ELPA_USE_FMA__
register __SIMD_DATATYPE w3 = _SIMD_FMA(a3_3, h_4_3, a4_3);
w3 = _SIMD_FMA(a2_3, h_4_2, w3);
w3 = _SIMD_FMA(a1_3, h_4_1, w3);
register __SIMD_DATATYPE z3 = _SIMD_FMA(a2_3, h_3_2, a3_3);
z3 = _SIMD_FMA(a1_3, h_3_1, z3);
register __SIMD_DATATYPE y3 = _SIMD_FMA(a1_3, h_2_1, a2_3);
register __SIMD_DATATYPE x3 = a1_3;
#else
register __SIMD_DATATYPE w3 = _SIMD_ADD(a4_3, _SIMD_MUL(a3_3, h_4_3)); register __SIMD_DATATYPE w3 = _SIMD_ADD(a4_3, _SIMD_MUL(a3_3, h_4_3));
w3 = _SIMD_ADD(w3, _SIMD_MUL(a2_3, h_4_2)); w3 = _SIMD_ADD(w3, _SIMD_MUL(a2_3, h_4_2));
w3 = _SIMD_ADD(w3, _SIMD_MUL(a1_3, h_4_1)); w3 = _SIMD_ADD(w3, _SIMD_MUL(a1_3, h_4_1));
...@@ -1219,12 +1341,22 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1219,12 +1341,22 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
z3 = _SIMD_ADD(z3, _SIMD_MUL(a1_3, h_3_1)); z3 = _SIMD_ADD(z3, _SIMD_MUL(a1_3, h_3_1));
register __SIMD_DATATYPE y3 = _SIMD_ADD(a2_3, _SIMD_MUL(a1_3, h_2_1)); register __SIMD_DATATYPE y3 = _SIMD_ADD(a2_3, _SIMD_MUL(a1_3, h_2_1));
register __SIMD_DATATYPE x3 = a1_3; register __SIMD_DATATYPE x3 = a1_3;
#endif /* __ELPA_USE_FMA__ */
__SIMD_DATATYPE a1_4 = _SIMD_LOAD(&q[(ldq*3)+3*offset]); __SIMD_DATATYPE a1_4 = _SIMD_LOAD(&q[(ldq*3)+3*offset]);
__SIMD_DATATYPE a2_4 = _SIMD_LOAD(&q[(ldq*2)+3*offset]); __SIMD_DATATYPE a2_4 = _SIMD_LOAD(&q[(ldq*2)+3*offset]);
__SIMD_DATATYPE a3_4 = _SIMD_LOAD(&q[ldq+3*offset]); __SIMD_DATATYPE a3_4 = _SIMD_LOAD(&q[ldq+3*offset]);
__SIMD_DATATYPE a4_4 = _SIMD_LOAD(&q[0+3*offset]); __SIMD_DATATYPE a4_4 = _SIMD_LOAD(&q[0+3*offset]);
#ifdef __ELPA_USE_FMA__
register __SIMD_DATATYPE w4 = _SIMD_FMA(a3_4, h_4_3, a4_4);
w4 = _SIMD_FMA(a2_4, h_4_2, w4);
w4 = _SIMD_FMA(a1_4, h_4_1, w4);
register __SIMD_DATATYPE z4 = _SIMD_FMA(a2_4, h_3_2, a3_4);
z4 = _SIMD_FMA(a1_4, h_3_1, z4);
register __SIMD_DATATYPE y4 = _SIMD_FMA(a1_4, h_2_1, a2_4);
register __SIMD_DATATYPE x4 = a1_4;
#else
register __SIMD_DATATYPE w4 = _SIMD_ADD(a4_4, _SIMD_MUL(a3_4, h_4_3)); register __SIMD_DATATYPE w4 = _SIMD_ADD(a4_4, _SIMD_MUL(a3_4, h_4_3));
w4 = _SIMD_ADD(w4, _SIMD_MUL(a2_4, h_4_2)); w4 = _SIMD_ADD(w4, _SIMD_MUL(a2_4, h_4_2));
w4 = _SIMD_ADD(w4, _SIMD_MUL(a1_4, h_4_1)); w4 = _SIMD_ADD(w4, _SIMD_MUL(a1_4, h_4_1));
...@@ -1232,12 +1364,22 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1232,12 +1364,22 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
z4 = _SIMD_ADD(z4, _SIMD_MUL(a1_4, h_3_1)); z4 = _SIMD_ADD(z4, _SIMD_MUL(a1_4, h_3_1));
register __SIMD_DATATYPE y4 = _SIMD_ADD(a2_4, _SIMD_MUL(a1_4, h_2_1)); register __SIMD_DATATYPE y4 = _SIMD_ADD(a2_4, _SIMD_MUL(a1_4, h_2_1));
register __SIMD_DATATYPE x4 = a1_4; register __SIMD_DATATYPE x4 = a1_4;
#endif /* __ELPA_USE_FMA__ */
__SIMD_DATATYPE a1_5 = _SIMD_LOAD(&q[(ldq*3)+4*offset]); __SIMD_DATATYPE a1_5 = _SIMD_LOAD(&q[(ldq*3)+4*offset]);
__SIMD_DATATYPE a2_5 = _SIMD_LOAD(&q[(ldq*2)+4*offset]); __SIMD_DATATYPE a2_5 = _SIMD_LOAD(&q[(ldq*2)+4*offset]);
__SIMD_DATATYPE a3_5 = _SIMD_LOAD(&q[ldq+4*offset]); __SIMD_DATATYPE a3_5 = _SIMD_LOAD(&q[ldq+4*offset]);
__SIMD_DATATYPE a4_5 = _SIMD_LOAD(&q[0+4*offset]); __SIMD_DATATYPE a4_5 = _SIMD_LOAD(&q[0+4*offset]);
#ifdef __ELPA_USE_FMA__
register __SIMD_DATATYPE w5 = _SIMD_FMA(a3_5, h_4_3, a4_5);
w5 = _SIMD_FMA(a2_5, h_4_2, w5);
w5 = _SIMD_FMA(a1_5, h_4_1, w5);
register __SIMD_DATATYPE z5 = _SIMD_FMA(a2_5, h_3_2, a3_5);
z5 = _SIMD_FMA(a1_5, h_3_1, z5);
register __SIMD_DATATYPE y5 = _SIMD_FMA(a1_5, h_2_1, a2_5);
register __SIMD_DATATYPE x5 = a1_5;
#else
register __SIMD_DATATYPE w5 = _SIMD_ADD(a4_5, _SIMD_MUL(a3_5, h_4_3)); register __SIMD_DATATYPE w5 = _SIMD_ADD(a4_5, _SIMD_MUL(a3_5, h_4_3));
w5 = _SIMD_ADD(w5, _SIMD_MUL(a2_5, h_4_2)); w5 = _SIMD_ADD(w5, _SIMD_MUL(a2_5, h_4_2));
w5 = _SIMD_ADD(w5, _SIMD_MUL(a1_5, h_4_1)); w5 = _SIMD_ADD(w5, _SIMD_MUL(a1_5, h_4_1));
...@@ -1245,12 +1387,22 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1245,12 +1387,22 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
z5 = _SIMD_ADD(z5, _SIMD_MUL(a1_5, h_3_1)); z5 = _SIMD_ADD(z5, _SIMD_MUL(a1_5, h_3_1));
register __SIMD_DATATYPE y5 = _SIMD_ADD(a2_5, _SIMD_MUL(a1_5, h_2_1)); register __SIMD_DATATYPE y5 = _SIMD_ADD(a2_5, _SIMD_MUL(a1_5, h_2_1));
register __SIMD_DATATYPE x5 = a1_5; register __SIMD_DATATYPE x5 = a1_5;
#endif /* __ELPA_USE_FMA__ */
__SIMD_DATATYPE a1_6 = _SIMD_LOAD(&q[(ldq*3)+5*offset]); __SIMD_DATATYPE a1_6 = _SIMD_LOAD(&q[(ldq*3)+5*offset]);
__SIMD_DATATYPE a2_6 = _SIMD_LOAD(&q[(ldq*2)+5*offset]); __SIMD_DATATYPE a2_6 = _SIMD_LOAD(&q[(ldq*2)+5*offset]);
__SIMD_DATATYPE a3_6 = _SIMD_LOAD(&q[ldq+5*offset]); __SIMD_DATATYPE a3_6 = _SIMD_LOAD(&q[ldq+5*offset]);
__SIMD_DATATYPE a4_6 = _SIMD_LOAD(&q[0+5*offset]); __SIMD_DATATYPE a4_6 = _SIMD_LOAD(&q[0+5*offset]);
#ifdef __ELPA_USE_FMA__
register __SIMD_DATATYPE w6 = _SIMD_FMA(a3_6, h_4_3, a4_6);
w6 = _SIMD_FMA(a2_6, h_4_2, w6);
w6 = _SIMD_FMA(a1_6, h_4_1, w6);
register __SIMD_DATATYPE z6 = _SIMD_FMA(a2_6, h_3_2, a3_6);
z6 = _SIMD_FMA(a1_6, h_3_1, z6);
register __SIMD_DATATYPE y6 = _SIMD_FMA(a1_6, h_2_1, a2_6);
register __SIMD_DATATYPE x6 = a1_6;
#else
register __SIMD_DATATYPE w6 = _SIMD_ADD(a4_6, _SIMD_MUL(a3_6, h_4_3)); register __SIMD_DATATYPE w6 = _SIMD_ADD(a4_6, _SIMD_MUL(a3_6, h_4_3));
w6 = _SIMD_ADD(w6, _SIMD_MUL(a2_6, h_4_2)); w6 = _SIMD_ADD(w6, _SIMD_MUL(a2_6, h_4_2));
w6 = _SIMD_ADD(w6, _SIMD_MUL(a1_6, h_4_1)); w6 = _SIMD_ADD(w6, _SIMD_MUL(a1_6, h_4_1));
...@@ -1258,6 +1410,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1258,6 +1410,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
z6 = _SIMD_ADD(z6, _SIMD_MUL(a1_6, h_3_1)); z6 = _SIMD_ADD(z6, _SIMD_MUL(a1_6, h_3_1));
register __SIMD_DATATYPE y6 = _SIMD_ADD(a2_6, _SIMD_MUL(a1_6, h_2_1)); register __SIMD_DATATYPE y6 = _SIMD_ADD(a2_6, _SIMD_MUL(a1_6, h_2_1));
register __SIMD_DATATYPE x6 = a1_6; register __SIMD_DATATYPE x6 = a1_6;
#endif /* __ELPA_USE_FMA__ */
__SIMD_DATATYPE q1; __SIMD_DATATYPE q1;
__SIMD_DATATYPE q2; __SIMD_DATATYPE q2;
...@@ -1563,12 +1716,26 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1563,12 +1716,26 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h3 = _SSE_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]); h3 = _SSE_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]);
#endif #endif
#if VEC_SET == 256
h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]);
#endif
#ifdef __ELPA_USE_FMA__
z1 = _SIMD_FMA(q1, h3, z1);
z2 = _SIMD_FMA(q2, h3, z2);
z3 = _SIMD_FMA(q3, h3, z3);
z4 = _SIMD_FMA(q4, h3, z4);
z5 = _SIMD_FMA(q5, h3, z5);
z6 = _SIMD_FMA(q6, h3, z6);
#else
z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3));
z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3));
z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3));
z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3));
z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3)); z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3));
z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3)); z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128
h4 = _SSE_SET1(hh[(ldh*3)+i-(BLOCK-4)]); h4 = _SSE_SET1(hh[(ldh*3)+i-(BLOCK-4)]);
#endif #endif
...@@ -1577,13 +1744,27 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1577,13 +1744,27 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h4 = _SSE_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]); h4 = _SSE_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]);
#endif #endif
#if VEC_SET == 256
h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]);
#endif
#ifdef __ELPA_USE_FMA__
w1 = _SIMD_FMA(q1, h4, w1);
w2 = _SIMD_FMA(q2, h4, w2);
w3 = _SIMD_FMA(q3, h4, w3);
w4 = _SIMD_FMA(q4, h4, w4);
w5 = _SIMD_FMA(q5, h4, w5);
w6 = _SIMD_FMA(q6, h4, w6);
#else
w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4));
w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4));
w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4));
w4 = _SIMD_ADD(w4, _SIMD_MUL(q4,h4)); w4 = _SIMD_ADD(w4, _SIMD_MUL(q4,h4));
w5 = _SIMD_ADD(w5, _SIMD_MUL(q5,h4)); w5 = _SIMD_ADD(w5, _SIMD_MUL(q5,h4));
w6 = _SIMD_ADD(w6, _SIMD_MUL(q6,h4)); w6 = _SIMD_ADD(w6, _SIMD_MUL(q6,h4));
#endif /* __ELPA_USE_FMA__ */
#endif /* BLOCK4 || BLOCK6 */ #endif /* BLOCK4 || BLOCK6 */
#ifdef BLOCK6 #ifdef BLOCK6
...@@ -1666,12 +1847,26 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1666,12 +1847,26 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
h2 = _SSE_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]); h2 = _SSE_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]);
#endif #endif
#if VEC_SET == 256
h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]);
#endif
#ifdef __FMA4_
y1 = _SIMD_FMA(q1, h2, y1);
y2 = _SIMD_FMA(q2, h2, y2);
y3 = _SIMD_FMA(q3, h2, y3);
y4 = _SIMD_FMA(q4, h2, y4);