Commit f71e670e authored by Andreas Marek
Browse files

Also unify the 512-bit BLOCK6 real kernel

parent dfa6ca91
...@@ -775,7 +775,6 @@ EXTRA_DIST = \ ...@@ -775,7 +775,6 @@ EXTRA_DIST = \
src/elpa2/kernels/complex_sse_1hv_template.c \ src/elpa2/kernels/complex_sse_1hv_template.c \
src/elpa2/kernels/complex_sse_2hv_template.c \ src/elpa2/kernels/complex_sse_2hv_template.c \
src/elpa2/kernels/complex_template.F90 \ src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/real_avx512_6hv_template.c \
src/elpa2/kernels/real_vsx_2hv_template.c \ src/elpa2/kernels/real_vsx_2hv_template.c \
src/elpa2/kernels/real_vsx_4hv_template.c \ src/elpa2/kernels/real_vsx_4hv_template.c \
src/elpa2/kernels/real_vsx_6hv_template.c \ src/elpa2/kernels/real_vsx_6hv_template.c \
......
...@@ -764,7 +764,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA ...@@ -764,7 +764,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
/* /*
!f>#ifdef HAVE_SSE_INTRINSICS !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
!f> subroutine hexa_hh_trafo_real_sse_6hv_double(q, hh, pnb, pnq, pldq, pldh) & !f> subroutine hexa_hh_trafo_real_SSE_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_SSE_6hv_double") !f> bind(C, name="hexa_hh_trafo_real_SSE_6hv_double")
!f> use, intrinsic :: iso_c_binding !f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
...@@ -777,7 +777,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA ...@@ -777,7 +777,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
/* /*
!f>#ifdef HAVE_SPARC64_SSE !f>#ifdef HAVE_SPARC64_SSE
!f> interface !f> interface
!f> subroutine hexa_hh_trafo_real_sparc64_6hv_double(q, hh, pnb, pnq, pldq, pldh) & !f> subroutine hexa_hh_trafo_real_SPARC64_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_SPARC64_6hv_double") !f> bind(C, name="hexa_hh_trafo_real_SPARC64_6hv_double")
!f> use, intrinsic :: iso_c_binding !f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
...@@ -790,7 +790,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA ...@@ -790,7 +790,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
/* /*
!f>#ifdef HAVE_SSE_INTRINSICS !f>#ifdef HAVE_SSE_INTRINSICS
!f> interface !f> interface
!f> subroutine hexa_hh_trafo_real_sse_6hv_single(q, hh, pnb, pnq, pldq, pldh) & !f> subroutine hexa_hh_trafo_real_SSE_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_SSE_6hv_single") !f> bind(C, name="hexa_hh_trafo_real_SSE_6hv_single")
!f> use, intrinsic :: iso_c_binding !f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
...@@ -803,7 +803,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA ...@@ -803,7 +803,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
/* /*
!f>#ifdef HAVE_SPARC64_SSE !f>#ifdef HAVE_SPARC64_SSE
!f> interface !f> interface
!f> subroutine hexa_hh_trafo_real_sparc64_6hv_single(q, hh, pnb, pnq, pldq, pldh) & !f> subroutine hexa_hh_trafo_real_SPARC64_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_SPARC64_6hv_single") !f> bind(C, name="hexa_hh_trafo_real_SPARC64_6hv_single")
!f> use, intrinsic :: iso_c_binding !f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
...@@ -840,7 +840,32 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA ...@@ -840,7 +840,32 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
!f> end interface !f> end interface
!f>#endif !f>#endif
*/ */
/*
 Fortran interface declaration for the C symbol "hexa_hh_trafo_real_AVX512_6hv_double"
 (double precision, six Householder vectors). It is only declared when HAVE_AVX512 is defined.
   q                    : C pointer, passed by value
   hh                   : real(c_double) array of shape (pnb,6)
   pnb, pnq, pldq, pldh : integer(c_int) arguments (declared without the value attribute)
 NOTE(review): the "!f>" prefix presumably marks lines that a build step extracts into a
 Fortran interface file; keep the prefixed lines byte-exact -- confirm against the build system.
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine hexa_hh_trafo_real_AVX512_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_AVX512_6hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
 Fortran interface declaration for the C symbol "hexa_hh_trafo_real_AVX512_6hv_single"
 (single precision, six Householder vectors). It is only declared when HAVE_AVX512 is defined.
   q                    : C pointer, passed by value
   hh                   : real(c_float) array of shape (pnb,6)
   pnb, pnq, pldq, pldh : integer(c_int) arguments (declared without the value attribute)
 NOTE(review): the "!f>" prefix presumably marks lines that a build step extracts into a
 Fortran interface file; keep the prefixed lines byte-exact -- confirm against the build system.
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine hexa_hh_trafo_real_AVX512_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_AVX512_6hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq, int* pldh) void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq, int* pldh)
{ {
...@@ -1393,6 +1418,19 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA ...@@ -1393,6 +1418,19 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif #endif
#endif /* VEC_SET == 256 */ #endif /* VEC_SET == 256 */
#if VEC_SET == 512
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 32
#define STEP_SIZE 32
#define UPPER_BOUND 24
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 64
#define STEP_SIZE 64
#define UPPER_BOUND 48
#endif
#endif /* VEC_SET == 512 */
for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE) for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE)
{ {
CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods); CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods);
...@@ -1422,11 +1460,55 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA ...@@ -1422,11 +1460,55 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif #endif
#endif /* VEC_SET == 256 */ #endif /* VEC_SET == 256 */
#if VEC_SET == 512
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 24
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 48
#endif
#endif /* VEC_SET == 512 */
if (nq -i == ROW_LENGTH )
{
CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += ROW_LENGTH;
}
#if VEC_SET == 512
#if VEC_SET == 512
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 16
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 32
#endif
#endif /* VEC_SET == 512 */
if (nq -i == ROW_LENGTH )
{
CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += ROW_LENGTH;
}
#if VEC_SET == 512
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 8
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 16
#endif
#endif /* VEC_SET == 512 */
if (nq -i == ROW_LENGTH ) if (nq -i == ROW_LENGTH )
{ {
CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods); CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += ROW_LENGTH; worked_on += ROW_LENGTH;
} }
#endif /* VEC_SET == 512 */
#endif /* BLOCK6 */ #endif /* BLOCK6 */
...@@ -1772,7 +1854,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1772,7 +1854,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
__SIMD_DATATYPE a5_1 = _SIMD_LOAD(&q[ldq]); __SIMD_DATATYPE a5_1 = _SIMD_LOAD(&q[ldq]);
__SIMD_DATATYPE a6_1 = _SIMD_LOAD(&q[0]); __SIMD_DATATYPE a6_1 = _SIMD_LOAD(&q[0]);
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
__SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]); __SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]);
__SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]); __SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]);
__SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]); __SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]);
...@@ -1810,7 +1892,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1810,7 +1892,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1)); t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
__SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]); __SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]);
__SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]); __SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]);
__SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]); __SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]);
...@@ -1843,7 +1925,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1843,7 +1925,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1)); v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
__SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]);
__SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]);
__SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]);
...@@ -1871,7 +1953,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -1871,7 +1953,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
__SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]);
__SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]);
__SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]);
...@@ -2245,7 +2327,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2245,7 +2327,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#ifdef BLOCK6 #ifdef BLOCK6
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); h5 = _SIMD_SET1(hh[(ldh*4)+i-1]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -2271,7 +2353,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2271,7 +2353,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
v6 = _SIMD_ADD(v6, _SIMD_MUL(q6,h5)); v6 = _SIMD_ADD(v6, _SIMD_MUL(q6,h5));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+i]); h6 = _SIMD_SET1(hh[(ldh*5)+i]);
#endif #endif
...@@ -2496,7 +2578,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2496,7 +2578,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#ifdef BLOCK6 #ifdef BLOCK6
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]);
#endif #endif
...@@ -2524,7 +2606,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2524,7 +2606,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
w6 = _SIMD_ADD(w6, _SIMD_MUL(q6,h4)); w6 = _SIMD_ADD(w6, _SIMD_MUL(q6,h4));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]);
#endif #endif
...@@ -2552,7 +2634,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2552,7 +2634,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
v6 = _SIMD_ADD(v6, _SIMD_MUL(q6,h5)); v6 = _SIMD_ADD(v6, _SIMD_MUL(q6,h5));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h1 = _SIMD_SET1(hh[nb-4]); h1 = _SIMD_SET1(hh[nb-4]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -2585,7 +2667,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2585,7 +2667,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h2 = _SIMD_SET1(hh[ldh+nb-3]); h2 = _SIMD_SET1(hh[ldh+nb-3]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -2611,7 +2693,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2611,7 +2693,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -2637,7 +2719,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2637,7 +2719,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3)); z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -2663,7 +2745,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2663,7 +2745,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
w6 = _SIMD_ADD(w6, _SIMD_MUL(q6,h4)); w6 = _SIMD_ADD(w6, _SIMD_MUL(q6,h4));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h1 = _SIMD_SET1(hh[nb-3]); h1 = _SIMD_SET1(hh[nb-3]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -2696,7 +2778,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2696,7 +2778,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h2 = _SIMD_SET1(hh[ldh+nb-2]); h2 = _SIMD_SET1(hh[ldh+nb-2]);
#endif #endif
...@@ -2724,7 +2806,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2724,7 +2806,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]);
#endif #endif
...@@ -2752,7 +2834,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2752,7 +2834,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3)); z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h1 = _SIMD_SET1(hh[nb-2]); h1 = _SIMD_SET1(hh[nb-2]);
#endif #endif
...@@ -2786,7 +2868,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2786,7 +2868,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h2 = _SIMD_SET1(hh[ldh+nb-1]); h2 = _SIMD_SET1(hh[ldh+nb-1]);
#endif #endif
...@@ -2814,7 +2896,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -2814,7 +2896,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h1 = _SIMD_SET1(hh[nb-1]); h1 = _SIMD_SET1(hh[nb-1]);
#endif #endif
...@@ -3529,7 +3611,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3529,7 +3611,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#ifdef BLOCK6 #ifdef BLOCK6
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+1]); h6 = _SIMD_SET1(hh[(ldh*5)+1]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -3574,7 +3656,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3574,7 +3656,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
_SIMD_STORE(&q[(ldq+3*offset)],q4); _SIMD_STORE(&q[(ldq+3*offset)],q4);
_SIMD_STORE(&q[(ldq+4*offset)],q5); _SIMD_STORE(&q[(ldq+4*offset)],q5);
_SIMD_STORE(&q[(ldq+5*offset)],q6); _SIMD_STORE(&q[(ldq+5*offset)],q6);
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+1]); h5 = _SIMD_SET1(hh[(ldh*4)+1]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -3613,7 +3696,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3613,7 +3696,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+2]); h6 = _SIMD_SET1(hh[(ldh*5)+2]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -3646,7 +3729,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3646,7 +3729,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
_SIMD_STORE(&q[(ldq*2)+4*offset],q5); _SIMD_STORE(&q[(ldq*2)+4*offset],q5);
_SIMD_STORE(&q[(ldq*2)+5*offset],q6); _SIMD_STORE(&q[(ldq*2)+5*offset],q6);
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+1]); h4 = _SIMD_SET1(hh[(ldh*3)+1]);
#endif #endif
...@@ -3687,7 +3770,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3687,7 +3770,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+2]); h5 = _SIMD_SET1(hh[(ldh*4)+2]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -3713,7 +3796,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3713,7 +3796,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+3]); h6 = _SIMD_SET1(hh[(ldh*5)+3]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -3745,7 +3828,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3745,7 +3828,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
_SIMD_STORE(&q[(ldq*3)+3*offset],q4); _SIMD_STORE(&q[(ldq*3)+3*offset],q4);
_SIMD_STORE(&q[(ldq*3)+4*offset],q5); _SIMD_STORE(&q[(ldq*3)+4*offset],q5);
_SIMD_STORE(&q[(ldq*3)+5*offset],q6); _SIMD_STORE(&q[(ldq*3)+5*offset],q6);
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h3 = _SIMD_SET1(hh[(ldh*2)+1]); h3 = _SIMD_SET1(hh[(ldh*2)+1]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -3783,7 +3867,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3783,7 +3867,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3)); q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+2]); h4 = _SIMD_SET1(hh[(ldh*3)+2]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -3809,7 +3893,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3809,7 +3893,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4));
#endif #endif
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+3]); h5 = _SIMD_SET1(hh[(ldh*4)+3]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -3835,7 +3919,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3835,7 +3919,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+4]); h6 = _SIMD_SET1(hh[(ldh*5)+4]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -3867,7 +3951,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3867,7 +3951,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
_SIMD_STORE(&q[(ldq*4)+3*offset],q4); _SIMD_STORE(&q[(ldq*4)+3*offset],q4);
_SIMD_STORE(&q[(ldq*4)+4*offset],q5); _SIMD_STORE(&q[(ldq*4)+4*offset],q5);
_SIMD_STORE(&q[(ldq*4)+5*offset],q6); _SIMD_STORE(&q[(ldq*4)+5*offset],q6);
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h2 = _SIMD_SET1(hh[(ldh)+1]); h2 = _SIMD_SET1(hh[(ldh)+1]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -3905,7 +3990,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3905,7 +3990,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(y6, h2)); q6 = _SIMD_SUB(q6, _SIMD_MUL(y6, h2));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h3 = _SIMD_SET1(hh[(ldh*2)+2]); h3 = _SIMD_SET1(hh[(ldh*2)+2]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -3931,7 +4016,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3931,7 +4016,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3)); q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+3]); h4 = _SIMD_SET1(hh[(ldh*3)+3]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -3957,7 +4042,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3957,7 +4042,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+4]); h5 = _SIMD_SET1(hh[(ldh*4)+4]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -3983,7 +4068,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -3983,7 +4068,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+5]); h6 = _SIMD_SET1(hh[(ldh*5)+5]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -4157,7 +4242,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -4157,7 +4242,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif /* BLOCK4 || BLOCK6 */ #endif /* BLOCK4 || BLOCK6 */
#ifdef BLOCK6 #ifdef BLOCK6
#if VEC_SET == 128 #if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); h5 = _SIMD_SET1(hh[(ldh*4)+i-1]);
#endif #endif
#if VEC_SET == 1281 #if VEC_SET == 1281
...@@ -4184,7 +4269,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h ...@@ -4184,7 +4269,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5));
#endif /* __ELPA_USE_FMA__ */ #endif /* __ELPA_USE_FMA__ */