Commit f71e670e authored by Andreas Marek
Browse files

Unify also 512bit BLOCK6 real kernel

parent dfa6ca91
......@@ -775,7 +775,6 @@ EXTRA_DIST = \
src/elpa2/kernels/complex_sse_1hv_template.c \
src/elpa2/kernels/complex_sse_2hv_template.c \
src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/real_avx512_6hv_template.c \
src/elpa2/kernels/real_vsx_2hv_template.c \
src/elpa2/kernels/real_vsx_4hv_template.c \
src/elpa2/kernels/real_vsx_6hv_template.c \
......
......@@ -764,7 +764,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine hexa_hh_trafo_real_sse_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> subroutine hexa_hh_trafo_real_SSE_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_SSE_6hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
......@@ -777,7 +777,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f> subroutine hexa_hh_trafo_real_sparc64_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> subroutine hexa_hh_trafo_real_SPARC64_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_SPARC64_6hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
......@@ -790,7 +790,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine hexa_hh_trafo_real_sse_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> subroutine hexa_hh_trafo_real_SSE_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_SSE_6hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
......@@ -803,7 +803,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f> subroutine hexa_hh_trafo_real_sparc64_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> subroutine hexa_hh_trafo_real_SPARC64_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_SPARC64_6hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
......@@ -840,7 +840,32 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
!f> end interface
!f>#endif
*/
/*
! Fortran interface block for the C routine hexa_hh_trafo_real_AVX512_6hv_double,
! only active when HAVE_AVX512 is defined.
! - q is handed over by value as a C pointer (type(c_ptr), value).
! - pnb, pnq, pldq, pldh are c_int without the value attribute, i.e. passed by
!   reference; this matches the int* parameters of the C definition in this file
!   (presumably the CONCAT_7ARGS(PREFIX,_hh_trafo_real_,...) function - confirm
!   that it expands to this symbol name for the AVX512 / double build).
! - hh is declared as real(c_double) with shape (pnb,6).
! NOTE(review): kernel code in this file indexes hh as hh[(ldh*k)+j]; verify that
! the declared leading dimension pnb is consistent with the pldh callers pass.
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine hexa_hh_trafo_real_AVX512_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_AVX512_6hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
! Fortran interface block for the C routine hexa_hh_trafo_real_AVX512_6hv_single,
! only active when HAVE_AVX512 is defined.
! - q is handed over by value as a C pointer (type(c_ptr), value).
! - pnb, pnq, pldq, pldh are c_int without the value attribute, i.e. passed by
!   reference; this matches the int* parameters of the C definition in this file
!   (presumably the CONCAT_7ARGS(PREFIX,_hh_trafo_real_,...) function - confirm
!   that it expands to this symbol name for the AVX512 / single build).
! - hh is declared as real(c_float) with shape (pnb,6).
! NOTE(review): kernel code in this file indexes hh as hh[(ldh*k)+j]; verify that
! the declared leading dimension pnb is consistent with the pldh callers pass.
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine hexa_hh_trafo_real_AVX512_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_AVX512_6hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
......@@ -1393,6 +1418,19 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == 256 */
#if VEC_SET == 512
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 32
#define STEP_SIZE 32
#define UPPER_BOUND 24
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 64
#define STEP_SIZE 64
#define UPPER_BOUND 48
#endif
#endif /* VEC_SET == 512 */
for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE)
{
CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods);
......@@ -1422,12 +1460,56 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA
#endif
#endif /* VEC_SET == 256 */
#if VEC_SET == 512
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 24
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 48
#endif
#endif /* VEC_SET == 512 */
if (nq -i == ROW_LENGTH )
{
CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += ROW_LENGTH;
}
#if VEC_SET == 512
#if VEC_SET == 512
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 16
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 32
#endif
#endif /* VEC_SET == 512 */
if (nq -i == ROW_LENGTH )
{
CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += ROW_LENGTH;
}
#if VEC_SET == 512
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 8
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 16
#endif
#endif /* VEC_SET == 512 */
if (nq -i == ROW_LENGTH )
{
CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods);
worked_on += ROW_LENGTH;
}
#endif /* VEC_SET == 512 */
#endif /* BLOCK6 */
#ifdef WITH_DEBUG
......@@ -1772,7 +1854,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
__SIMD_DATATYPE a5_1 = _SIMD_LOAD(&q[ldq]);
__SIMD_DATATYPE a6_1 = _SIMD_LOAD(&q[0]);
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
__SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]);
__SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]);
__SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]);
......@@ -1810,7 +1892,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
__SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]);
__SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]);
__SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]);
......@@ -1843,7 +1925,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
__SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]);
__SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]);
__SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]);
......@@ -1871,7 +1953,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
__SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]);
__SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]);
__SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]);
......@@ -2245,7 +2327,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#ifdef BLOCK6
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+i-1]);
#endif
#if VEC_SET == 1281
......@@ -2271,7 +2353,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
v6 = _SIMD_ADD(v6, _SIMD_MUL(q6,h5));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+i]);
#endif
......@@ -2496,7 +2578,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#ifdef BLOCK6
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]);
#endif
......@@ -2524,7 +2606,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
w6 = _SIMD_ADD(w6, _SIMD_MUL(q6,h4));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]);
#endif
......@@ -2552,7 +2634,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
v6 = _SIMD_ADD(v6, _SIMD_MUL(q6,h5));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h1 = _SIMD_SET1(hh[nb-4]);
#endif
#if VEC_SET == 1281
......@@ -2585,7 +2667,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h2 = _SIMD_SET1(hh[ldh+nb-3]);
#endif
#if VEC_SET == 1281
......@@ -2611,7 +2693,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]);
#endif
#if VEC_SET == 1281
......@@ -2637,7 +2719,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]);
#endif
#if VEC_SET == 1281
......@@ -2663,7 +2745,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
w6 = _SIMD_ADD(w6, _SIMD_MUL(q6,h4));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h1 = _SIMD_SET1(hh[nb-3]);
#endif
#if VEC_SET == 1281
......@@ -2696,7 +2778,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h2 = _SIMD_SET1(hh[ldh+nb-2]);
#endif
......@@ -2724,7 +2806,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]);
#endif
......@@ -2752,7 +2834,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h1 = _SIMD_SET1(hh[nb-2]);
#endif
......@@ -2786,7 +2868,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h2 = _SIMD_SET1(hh[ldh+nb-1]);
#endif
......@@ -2814,7 +2896,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h1 = _SIMD_SET1(hh[nb-1]);
#endif
......@@ -3529,7 +3611,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#ifdef BLOCK6
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+1]);
#endif
#if VEC_SET == 1281
......@@ -3574,7 +3656,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
_SIMD_STORE(&q[(ldq+3*offset)],q4);
_SIMD_STORE(&q[(ldq+4*offset)],q5);
_SIMD_STORE(&q[(ldq+5*offset)],q6);
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+1]);
#endif
#if VEC_SET == 1281
......@@ -3613,7 +3696,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+2]);
#endif
#if VEC_SET == 1281
......@@ -3646,7 +3729,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
_SIMD_STORE(&q[(ldq*2)+4*offset],q5);
_SIMD_STORE(&q[(ldq*2)+5*offset],q6);
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+1]);
#endif
......@@ -3687,7 +3770,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+2]);
#endif
#if VEC_SET == 1281
......@@ -3713,7 +3796,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+3]);
#endif
#if VEC_SET == 1281
......@@ -3745,7 +3828,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
_SIMD_STORE(&q[(ldq*3)+3*offset],q4);
_SIMD_STORE(&q[(ldq*3)+4*offset],q5);
_SIMD_STORE(&q[(ldq*3)+5*offset],q6);
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h3 = _SIMD_SET1(hh[(ldh*2)+1]);
#endif
#if VEC_SET == 1281
......@@ -3783,7 +3867,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+2]);
#endif
#if VEC_SET == 1281
......@@ -3809,7 +3893,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4));
#endif
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+3]);
#endif
#if VEC_SET == 1281
......@@ -3835,7 +3919,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+4]);
#endif
#if VEC_SET == 1281
......@@ -3867,7 +3951,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
_SIMD_STORE(&q[(ldq*4)+3*offset],q4);
_SIMD_STORE(&q[(ldq*4)+4*offset],q5);
_SIMD_STORE(&q[(ldq*4)+5*offset],q6);
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h2 = _SIMD_SET1(hh[(ldh)+1]);
#endif
#if VEC_SET == 1281
......@@ -3905,7 +3990,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(y6, h2));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h3 = _SIMD_SET1(hh[(ldh*2)+2]);
#endif
#if VEC_SET == 1281
......@@ -3931,7 +4016,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+3]);
#endif
#if VEC_SET == 1281
......@@ -3957,7 +4042,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+4]);
#endif
#if VEC_SET == 1281
......@@ -3983,7 +4068,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+5]);
#endif
#if VEC_SET == 1281
......@@ -4157,7 +4242,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif /* BLOCK4 || BLOCK6 */
#ifdef BLOCK6
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+i-1]);
#endif
#if VEC_SET == 1281
......@@ -4184,7 +4269,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+i]);
#endif
#if VEC_SET == 1281
......@@ -4332,7 +4417,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif /* BLOCK4 || BLOCK6 */
#ifdef BLOCK6
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]);
#endif
#if VEC_SET == 1281
......@@ -4358,7 +4443,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]);
#endif
#if VEC_SET == 1281
......@@ -4459,7 +4544,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif
#ifdef BLOCK6
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]);
#endif
#if VEC_SET == 1281
......@@ -4485,7 +4570,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3));
#endif
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]);
#endif
#if VEC_SET == 1281
......@@ -4555,7 +4640,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif
#ifdef BLOCK6
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h2 = _SIMD_SET1(hh[ldh+nb-2]);
#endif
#if VEC_SET == 1281
......@@ -4582,7 +4667,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(y6, h2));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]);
#endif
#if VEC_SET == 1281
......@@ -4620,7 +4705,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#endif /* BLOCK4 || BLOCK6 */
#ifdef BLOCK6
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h1 = _SIMD_SET1(hh[nb-2]);
#endif
#if VEC_SET == 1281
......@@ -4653,7 +4738,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
q6 = _SIMD_SUB(q6, _SIMD_MUL(x6, h1));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h2 = _SIMD_SET1(hh[ldh+nb-1]);
#endif
#if VEC_SET == 1281
......@@ -4685,7 +4770,8 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
_SIMD_STORE(&q[((nb+3)*ldq)+3*offset],q4);
_SIMD_STORE(&q[((nb+3)*ldq)+4*offset],q5);
_SIMD_STORE(&q[((nb+3)*ldq)+5*offset],q6);
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h1 = _SIMD_SET1(hh[nb-1]);
#endif
#if VEC_SET == 1281
......@@ -5031,7 +5117,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
__SIMD_DATATYPE a5_1 = _SIMD_LOAD(&q[ldq]);
__SIMD_DATATYPE a6_1 = _SIMD_LOAD(&q[0]);
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
__SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]);
__SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]);
__SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]);
......@@ -5069,7 +5155,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
__SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]);
__SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]);
__SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]);
......@@ -5103,7 +5189,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
__SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]);
__SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]);
__SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]);
......@@ -5131,7 +5217,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
__SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]);
__SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]);
__SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]);
......@@ -5451,7 +5537,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#ifdef BLOCK6
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+i-1]);
#endif
#if VEC_SET == 1281
......@@ -5475,7 +5561,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
v5 = _SIMD_ADD(v5, _SIMD_MUL(q5,h5));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h6 = _SIMD_SET1(hh[(ldh*5)+i]);
#endif
......@@ -5683,7 +5769,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
#ifdef BLOCK6
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]);
#endif
......@@ -5709,7 +5795,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
w5 = _SIMD_ADD(w5, _SIMD_MUL(q5,h4));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]);
#endif
......@@ -5735,7 +5821,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
v5 = _SIMD_ADD(v5, _SIMD_MUL(q5,h5));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h1 = _SIMD_SET1(hh[nb-4]);
#endif
#if VEC_SET == 1281
......@@ -5765,7 +5851,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h2 = _SIMD_SET1(hh[ldh+nb-3]);
#endif
#if VEC_SET == 1281
......@@ -5789,7 +5875,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]);
#endif
#if VEC_SET == 1281
......@@ -5813,7 +5899,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]);
#endif
#if VEC_SET == 1281
......@@ -5837,7 +5923,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
w5 = _SIMD_ADD(w5, _SIMD_MUL(q5,h4));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h1 = _SIMD_SET1(hh[nb-3]);
#endif
#if VEC_SET == 1281
......@@ -5867,7 +5953,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h2 = _SIMD_SET1(hh[ldh+nb-2]);
#endif
......@@ -5893,7 +5979,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h
y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2));
#endif /* __ELPA_USE_FMA__ */
#if VEC_SET == 128
#if VEC_SET == 128 || VEC_SET == 512
h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]);
#endif
......@@ -5919,7 +6005,7 @@ __forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,h