diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 311f46a664fa98c506dc44388f3643a62c91b9b5..3844f97a482422d84f027a3a7d1c5de0aa1618ef 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -2192,6 +2192,19 @@ intel-double-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block6-complex
     - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
     - make check TEST_FLAGS='1000 500 128'

+intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block6-complex-avx512_block1-kernel-jobs:
+  tags:
+    - KNL
+  script:
+    - ./autogen.sh
+    - ./configure FC=mpiifort CC=mpiicc CFLAGS="-O3 -mtune=knl -axMIC-AVX512" FCFLAGS="-O3 -mtune=knl -axMIC-AVX512" SCALAPACK_FCFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" SCALAPACK_LDFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKLROOT/lib/intel64" --with-real-avx512_block6-kernel-only --with-complex-avx512_block1-kernel-only --enable-single-precision
+    - /home/elpa/wait_until_midnight.sh
+    - make -j 8
+    - export OMP_NUM_THREADS=1
+    - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
+    - make check TEST_FLAGS='1000 500 128'
+
+
 intel-set-kernel-via-environment-variable-mpi-openmp-job:
   tags:
     - cpu
diff --git a/Makefile.am b/Makefile.am
index 3ce745876ce9a0d05ad402d4d78cbb3f5c3069d5..6273169bfea31d3369efd0bb52e80e2841ffedd8 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -204,9 +204,9 @@ endif

 if WITH_REAL_AVX512_BLOCK6_KERNEL
   libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_double_precision.c
-#if WANT_SINGLE_PRECISION_REAL
-# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
-#endif
+if WANT_SINGLE_PRECISION_REAL
+  libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
+endif
 endif

diff --git a/src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_single_precision.c b/src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_single_precision.c
index 511267abb1571142006c6af85b5fed8b90c84a4d..eeeed2be1a62dbd5f43fac43ff2af9b918756d45 100644
--- a/src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_single_precision.c
+++ b/src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_single_precision.c
@@ -495,7 +495,7 @@ __forceinline void hh_trafo_kernel_64_AVX512_4hv_single(float* q, float* hh, int
 	q3 = _mm512_NFMA_ps(x3, h1, q3);
 	q3 = _mm512_NFMA_ps(y3, h2, q3);
 	q3 = _mm512_NFMA_ps(z3, h3, q3);
-	q3 = _mm512_NFMA_pd(w3, h4, q3);
+	q3 = _mm512_NFMA_ps(w3, h4, q3);
 	_mm512_store_ps(&q[(i*ldq)+32],q3);

 	q4 = _mm512_load_ps(&q[(i*ldq)+48]);
@@ -932,7 +932,7 @@ __forceinline void hh_trafo_kernel_48_AVX512_4hv_single(float* q, float* hh, int
 	q3 = _mm512_NFMA_ps(x3, h1, q3);
 	q3 = _mm512_NFMA_ps(y3, h2, q3);
 	q3 = _mm512_NFMA_ps(z3, h3, q3);
-	q3 = _mm512_NFMA_pd(w3, h4, q3);
+	q3 = _mm512_NFMA_ps(w3, h4, q3);
 	_mm512_store_ps(&q[(i*ldq)+32],q3);

 //	q4 = _mm512_load_ps(&q[(i*ldq)+48]);
@@ -1369,7 +1369,7 @@ __forceinline void hh_trafo_kernel_32_AVX512_4hv_single(float* q, float* hh, int
 //	q3 = _mm512_NFMA_ps(x3, h1, q3);
 //	q3 = _mm512_NFMA_ps(y3, h2, q3);
 //	q3 = _mm512_NFMA_ps(z3, h3, q3);
-//	q3 = _mm512_NFMA_pd(w3, h4, q3);
+//	q3 = _mm512_NFMA_ps(w3, h4, q3);
 //	_mm512_store_ps(&q[(i*ldq)+32],q3);

 //	q4 = _mm512_load_ps(&q[(i*ldq)+48]);
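For context on the _mm512_NFMA_pd -> _mm512_NFMA_ps corrections in elpa2_kernels_real_avx512_4hv_single_precision.c: the operands q3, w3 and h4 are __m512 (16 packed floats), so the single-precision form of the negated-FMA wrapper must be used; the _pd form expands to the __m512d (packed double) intrinsic. The sketch below is illustrative only and is not part of the patch; the helper name example_nfma_update and its arguments are invented, but the intrinsics and the _mm512_NFMA_ps wrapper are the ones these kernels use. It shows what the corrected statement computes, namely q3 := q3 - w3*h4.

#include <immintrin.h>

/* Same wrapper the single-precision AVX-512 kernels define:
   r = -(a*b) + c on 16 packed floats. */
#define _mm512_NFMA_ps(a,b,c) _mm512_fnmadd_ps(a,b,c)

/* Hypothetical helper, for illustration only: applies q3 := q3 - w3*h4 to one
   16-float slice of q. The _pd variant would expand to _mm512_fnmadd_pd, which
   operates on __m512d and does not accept these __m512 operands. q and w are
   assumed 64-byte aligned, as the kernels' aligned loads/stores require; build
   with AVX-512F enabled (e.g. -axMIC-AVX512 as in the CI job, or -mavx512f). */
static void example_nfma_update(float *q, const float *w, float h4_scalar)
{
	__m512 q3 = _mm512_load_ps(q);          /* 16 floats of q                */
	__m512 w3 = _mm512_load_ps(w);          /* 16 floats of the w vector     */
	__m512 h4 = _mm512_set1_ps(h4_scalar);  /* broadcast Householder coeff.  */

	q3 = _mm512_NFMA_ps(w3, h4, q3);        /* q3 = q3 - w3*h4               */

	_mm512_store_ps(q, q3);
}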
@@ -1806,7 +1806,7 @@ __forceinline void hh_trafo_kernel_16_AVX512_4hv_single(float* q, float* hh, int
 //	q3 = _mm512_NFMA_ps(x3, h1, q3);
 //	q3 = _mm512_NFMA_ps(y3, h2, q3);
 //	q3 = _mm512_NFMA_ps(z3, h3, q3);
-//	q3 = _mm512_NFMA_pd(w3, h4, q3);
+//	q3 = _mm512_NFMA_ps(w3, h4, q3);
 //	_mm512_store_ps(&q[(i*ldq)+32],q3);

 //	q4 = _mm512_load_ps(&q[(i*ldq)+48]);
diff --git a/src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c b/src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
index b930d06b2407e976d3b4529e3d9d1e4d2afb92ea..760d847ab7de4578ec38850105453eced7969ae0 100644
--- a/src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
+++ b/src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
@@ -42,47 +42,44 @@
 // any derivatives of ELPA under the same license that we chose for
 // the original distribution, the GNU Lesser General Public License.
 //
-// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
-//
-#include "config-f90.h"
+// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
+// --------------------------------------------------------------------------------------------------
+#include "config-f90.h"

 #include

 #define __forceinline __attribute__((always_inline)) static

-#ifdef HAVE_AVX2
+#ifdef HAVE_AVX512

-#ifdef __FMA4__
 #define __ELPA_USE_FMA__
-#define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c)
-#define _mm256_NFMA_ps(a,b,c) _mm256_nmacc_ps(a,b,c)
-#define _mm256_FMSUB_ps(a,b,c) _mm256_msub_ps(a,b,c)
-#endif
+#define _mm512_FMA_ps(a,b,c) _mm512_fmadd_ps(a,b,c)
+#define _mm512_NFMA_ps(a,b,c) _mm512_fnmadd_ps(a,b,c)
+#define _mm512_FMSUB_ps(a,b,c) _mm512_fmsub_ps(a,b,c)

-#ifdef __AVX2__
-#define __ELPA_USE_FMA__
-#define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c)
-#define _mm256_NFMA_ps(a,b,c) _mm256_fnmadd_ps(a,b,c)
-#define _mm256_FMSUB_ps(a,b,c) _mm256_fmsub_ps(a,b,c)
 #endif
-#endif

 //Forward declaration
-static void hh_trafo_kernel_4_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
-static void hh_trafo_kernel_8_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+//static void hh_trafo_kernel_4_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);

-void hexa_hh_trafo_real_avx512_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
+//static void hh_trafo_kernel_8_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+static void hh_trafo_kernel_16_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+//static void hh_trafo_kernel_24_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+static void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+static void hh_trafo_kernel_48_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+static void hh_trafo_kernel_64_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);

 /*
-!f>#ifdef HAVE_AVX512
+!f>#if defined(HAVE_AVX512)
 !f> interface
 !f> subroutine hexa_hh_trafo_real_avx512_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
-!f> bind(C, name="hexa_hh_trafo_real_512_6hv_single")
+!f> bind(C, name="hexa_hh_trafo_real_avx512_6hv_single")
 !f> use, intrinsic :: iso_c_binding
 !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
 !f> type(c_ptr), value :: q
-!f> real(kind=c_float) :: hh(pnb,6)
+!f>
real(kind=c_float) :: hh(pnb,6) !f> end subroutine !f> end interface !f>#endif @@ -195,44 +192,39 @@ void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pn scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; } + // Production level kernel calls with padding -#ifdef __AVX__ - for (i = 0; i < nq-4; i+=8) + for (i = 0; i < nq-48; i+=64) { - hh_trafo_kernel_8_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); + hh_trafo_kernel_64_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); } if (nq == i) { return; } - else - { - hh_trafo_kernel_4_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); - } -#else - for (i = 0; i < nq-2; i+=4) + if (nq-i == 48) { - hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); + hh_trafo_kernel_48_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); } - if (nq == i) + if (nq-i == 32) { - return; + hh_trafo_kernel_32_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); } - else + else { - hh_trafo_kernel_2_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); + hh_trafo_kernel_16_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods); } -#endif + } /** * Unrolled kernel that computes - * 8 rows of Q simultaneously, a + * 64 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 1 update is performed */ -__forceinline void hh_trafo_kernel_8_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods) +__forceinline void hh_trafo_kernel_64_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [8 x nb+3] * hh @@ -240,772 +232,868 @@ __forceinline void hh_trafo_kernel_8_AVX512_6hv_single(float* q, float* hh, int ///////////////////////////////////////////////////// int i; - __m256 a1_1 = _mm256_load_ps(&q[ldq*5]); - __m256 a2_1 = _mm256_load_ps(&q[ldq*4]); - __m256 a3_1 = _mm256_load_ps(&q[ldq*3]); - __m256 a4_1 = _mm256_load_ps(&q[ldq*2]); - __m256 a5_1 = _mm256_load_ps(&q[ldq]); - __m256 a6_1 = _mm256_load_ps(&q[0]); // q(1,1) | q(2,1) | q(3,1) | q(4,1) - - __m256 h_6_5 = _mm256_broadcast_ss(&hh[(ldh*5)+1]); - __m256 h_6_4 = _mm256_broadcast_ss(&hh[(ldh*5)+2]); - __m256 h_6_3 = _mm256_broadcast_ss(&hh[(ldh*5)+3]); - __m256 h_6_2 = _mm256_broadcast_ss(&hh[(ldh*5)+4]); - __m256 h_6_1 = _mm256_broadcast_ss(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - register __m256 t1 = _mm256_FMA_ps(a5_1, h_6_5, a6_1); - t1 = _mm256_FMA_ps(a4_1, h_6_4, t1); - t1 = _mm256_FMA_ps(a3_1, h_6_3, t1); - t1 = _mm256_FMA_ps(a2_1, h_6_2, t1); - t1 = _mm256_FMA_ps(a1_1, h_6_1, t1); -#else - register __m256 t1 = _mm256_add_ps(a6_1, _mm256_mul_ps(a5_1, h_6_5)); - t1 = _mm256_add_ps(t1, _mm256_mul_ps(a4_1, h_6_4)); - t1 = _mm256_add_ps(t1, _mm256_mul_ps(a3_1, h_6_3)); - t1 = _mm256_add_ps(t1, _mm256_mul_ps(a2_1, h_6_2)); - t1 = _mm256_add_ps(t1, _mm256_mul_ps(a1_1, h_6_1)); -#endif - __m256 h_5_4 = _mm256_broadcast_ss(&hh[(ldh*4)+1]); - __m256 h_5_3 = _mm256_broadcast_ss(&hh[(ldh*4)+2]); - __m256 h_5_2 = _mm256_broadcast_ss(&hh[(ldh*4)+3]); - __m256 h_5_1 = _mm256_broadcast_ss(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - register __m256 v1 = _mm256_FMA_ps(a4_1, h_5_4, a5_1); - v1 = _mm256_FMA_ps(a3_1, h_5_3, v1); - v1 = _mm256_FMA_ps(a2_1, h_5_2, v1); - v1 = _mm256_FMA_ps(a1_1, h_5_1, v1); -#else - register __m256 v1 = _mm256_add_ps(a5_1, _mm256_mul_ps(a4_1, h_5_4)); - v1 = _mm256_add_ps(v1, _mm256_mul_ps(a3_1, h_5_3)); - v1 = _mm256_add_ps(v1, 
_mm256_mul_ps(a2_1, h_5_2)); - v1 = _mm256_add_ps(v1, _mm256_mul_ps(a1_1, h_5_1)); -#endif - __m256 h_4_3 = _mm256_broadcast_ss(&hh[(ldh*3)+1]); - __m256 h_4_2 = _mm256_broadcast_ss(&hh[(ldh*3)+2]); - __m256 h_4_1 = _mm256_broadcast_ss(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - register __m256 w1 = _mm256_FMA_ps(a3_1, h_4_3, a4_1); - w1 = _mm256_FMA_ps(a2_1, h_4_2, w1); - w1 = _mm256_FMA_ps(a1_1, h_4_1, w1); -#else - register __m256 w1 = _mm256_add_ps(a4_1, _mm256_mul_ps(a3_1, h_4_3)); - w1 = _mm256_add_ps(w1, _mm256_mul_ps(a2_1, h_4_2)); - w1 = _mm256_add_ps(w1, _mm256_mul_ps(a1_1, h_4_1)); -#endif - __m256 h_2_1 = _mm256_broadcast_ss(&hh[ldh+1]); - __m256 h_3_2 = _mm256_broadcast_ss(&hh[(ldh*2)+1]); - __m256 h_3_1 = _mm256_broadcast_ss(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - register __m256 z1 = _mm256_FMA_ps(a2_1, h_3_2, a3_1); - z1 = _mm256_FMA_ps(a1_1, h_3_1, z1); - register __m256 y1 = _mm256_FMA_ps(a1_1, h_2_1, a2_1); -#else - register __m256 z1 = _mm256_add_ps(a3_1, _mm256_mul_ps(a2_1, h_3_2)); - z1 = _mm256_add_ps(z1, _mm256_mul_ps(a1_1, h_3_1)); - register __m256 y1 = _mm256_add_ps(a2_1, _mm256_mul_ps(a1_1, h_2_1)); -#endif - register __m256 x1 = a1_1; - - -// __m256d a1_2 = _mm256_load_pd(&q[(ldq*5)+4]); -// __m256d a2_2 = _mm256_load_pd(&q[(ldq*4)+4]); -// __m256d a3_2 = _mm256_load_pd(&q[(ldq*3)+4]); -// __m256d a4_2 = _mm256_load_pd(&q[(ldq*2)+4]); -// __m256d a5_2 = _mm256_load_pd(&q[(ldq)+4]); -// __m256d a6_2 = _mm256_load_pd(&q[4]); - -#ifdef __ELPA_USE_FMA__ -// register __m256d t2 = _mm256_FMA_pd(a5_2, h_6_5, a6_2); -// t2 = _mm256_FMA_pd(a4_2, h_6_4, t2); -// t2 = _mm256_FMA_pd(a3_2, h_6_3, t2); -// t2 = _mm256_FMA_pd(a2_2, h_6_2, t2); -// t2 = _mm256_FMA_pd(a1_2, h_6_1, t2); -// register __m256d v2 = _mm256_FMA_pd(a4_2, h_5_4, a5_2); -// v2 = _mm256_FMA_pd(a3_2, h_5_3, v2); -// v2 = _mm256_FMA_pd(a2_2, h_5_2, v2); -// v2 = _mm256_FMA_pd(a1_2, h_5_1, v2); -// register __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2); -// w2 = _mm256_FMA_pd(a2_2, h_4_2, w2); -// w2 = _mm256_FMA_pd(a1_2, h_4_1, w2); -// register __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2); -// z2 = _mm256_FMA_pd(a1_2, h_3_1, z2); -// register __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2); -#else -// register __m256d t2 = _mm256_add_pd(a6_2, _mm256_mul_pd(a5_2, h_6_5)); -// t2 = _mm256_add_pd(t2, _mm256_mul_pd(a4_2, h_6_4)); -// t2 = _mm256_add_pd(t2, _mm256_mul_pd(a3_2, h_6_3)); -// t2 = _mm256_add_pd(t2, _mm256_mul_pd(a2_2, h_6_2)); -// t2 = _mm256_add_pd(t2, _mm256_mul_pd(a1_2, h_6_1)); -// register __m256d v2 = _mm256_add_pd(a5_2, _mm256_mul_pd(a4_2, h_5_4)); -// v2 = _mm256_add_pd(v2, _mm256_mul_pd(a3_2, h_5_3)); -// v2 = _mm256_add_pd(v2, _mm256_mul_pd(a2_2, h_5_2)); -// v2 = _mm256_add_pd(v2, _mm256_mul_pd(a1_2, h_5_1)); -// register __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3)); -// w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2)); -// w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1)); -// register __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2)); -// z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1)); -// register __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1)); -#endif -// register __m256d x2 = a1_2; + __m512 a1_1 = _mm512_load_ps(&q[ldq*5]); + __m512 a2_1 = _mm512_load_ps(&q[ldq*4]); + __m512 a3_1 = _mm512_load_ps(&q[ldq*3]); + __m512 a4_1 = _mm512_load_ps(&q[ldq*2]); + __m512 a5_1 = _mm512_load_ps(&q[ldq]); + __m512 a6_1 = _mm512_load_ps(&q[0]); + + __m512 h_6_5 = _mm512_set1_ps(hh[(ldh*5)+1]); + __m512 h_6_4 = _mm512_set1_ps(hh[(ldh*5)+2]); + 
__m512 h_6_3 = _mm512_set1_ps(hh[(ldh*5)+3]); + __m512 h_6_2 = _mm512_set1_ps(hh[(ldh*5)+4]); + __m512 h_6_1 = _mm512_set1_ps(hh[(ldh*5)+5]); + +// register __m512d t1 = _mm512_FMA_ps(a5_1, h_6_5, a6_1); + __m512 t1 = _mm512_FMA_ps(a5_1, h_6_5, a6_1); + + t1 = _mm512_FMA_ps(a4_1, h_6_4, t1); + t1 = _mm512_FMA_ps(a3_1, h_6_3, t1); + t1 = _mm512_FMA_ps(a2_1, h_6_2, t1); + t1 = _mm512_FMA_ps(a1_1, h_6_1, t1); + + __m512 h_5_4 = _mm512_set1_ps(hh[(ldh*4)+1]); + __m512 h_5_3 = _mm512_set1_ps(hh[(ldh*4)+2]); + __m512 h_5_2 = _mm512_set1_ps(hh[(ldh*4)+3]); + __m512 h_5_1 = _mm512_set1_ps(hh[(ldh*4)+4]); + +// register __m512d v1 = _mm512_FMA_ps(a4_1, h_5_4, a5_1); + __m512 v1 = _mm512_FMA_ps(a4_1, h_5_4, a5_1); + + v1 = _mm512_FMA_ps(a3_1, h_5_3, v1); + v1 = _mm512_FMA_ps(a2_1, h_5_2, v1); + v1 = _mm512_FMA_ps(a1_1, h_5_1, v1); + + __m512 h_4_3 = _mm512_set1_ps(hh[(ldh*3)+1]); + __m512 h_4_2 = _mm512_set1_ps(hh[(ldh*3)+2]); + __m512 h_4_1 = _mm512_set1_ps(hh[(ldh*3)+3]); + +// register __m512d w1 = _mm512_FMA_ps(a3_1, h_4_3, a4_1); + __m512 w1 = _mm512_FMA_ps(a3_1, h_4_3, a4_1); + + w1 = _mm512_FMA_ps(a2_1, h_4_2, w1); + w1 = _mm512_FMA_ps(a1_1, h_4_1, w1); + + __m512 h_2_1 = _mm512_set1_ps(hh[ldh+1]); + __m512 h_3_2 = _mm512_set1_ps(hh[(ldh*2)+1]); + __m512 h_3_1 = _mm512_set1_ps(hh[(ldh*2)+2]); + +// register __m512d z1 = _mm512_FMA_ps(a2_1, h_3_2, a3_1); + __m512 z1 = _mm512_FMA_ps(a2_1, h_3_2, a3_1); + + z1 = _mm512_FMA_ps(a1_1, h_3_1, z1); +// register __m512d y1 = _mm512_FMA_ps(a1_1, h_2_1, a2_1); + __m512 y1 = _mm512_FMA_ps(a1_1, h_2_1, a2_1); + + +// register __m512d x1 = a1_1; + __m512 x1 = a1_1; + + + + __m512 a1_2 = _mm512_load_ps(&q[(ldq*5)+16]); + __m512 a2_2 = _mm512_load_ps(&q[(ldq*4)+16]); + __m512 a3_2 = _mm512_load_ps(&q[(ldq*3)+16]); + __m512 a4_2 = _mm512_load_ps(&q[(ldq*2)+16]); + __m512 a5_2 = _mm512_load_ps(&q[(ldq)+16]); + __m512 a6_2 = _mm512_load_ps(&q[0+16]); + +// register __m512d t2 = _mm512_FMA_ps(a5_2, h_6_5, a6_2); + __m512 t2 = _mm512_FMA_ps(a5_2, h_6_5, a6_2); + + t2 = _mm512_FMA_ps(a4_2, h_6_4, t2); + t2 = _mm512_FMA_ps(a3_2, h_6_3, t2); + t2 = _mm512_FMA_ps(a2_2, h_6_2, t2); + t2 = _mm512_FMA_ps(a1_2, h_6_1, t2); + +// register __m512d v2 = _mm512_FMA_ps(a4_2, h_5_4, a5_2); + __m512 v2 = _mm512_FMA_ps(a4_2, h_5_4, a5_2); + + v2 = _mm512_FMA_ps(a3_2, h_5_3, v2); + v2 = _mm512_FMA_ps(a2_2, h_5_2, v2); + v2 = _mm512_FMA_ps(a1_2, h_5_1, v2); + +// register __m512d w2 = _mm512_FMA_ps(a3_2, h_4_3, a4_2); + __m512 w2 = _mm512_FMA_ps(a3_2, h_4_3, a4_2); + + w2 = _mm512_FMA_ps(a2_2, h_4_2, w2); + w2 = _mm512_FMA_ps(a1_2, h_4_1, w2); + +// register __m512d z2 = _mm512_FMA_ps(a2_2, h_3_2, a3_2); + __m512 z2 = _mm512_FMA_ps(a2_2, h_3_2, a3_2); + + z2 = _mm512_FMA_ps(a1_2, h_3_1, z2); +// register __m512d y2 = _mm512_FMA_ps(a1_2, h_2_1, a2_2); + __m512 y2 = _mm512_FMA_ps(a1_2, h_2_1, a2_2); + + +// register __m512d x2 = a1_2; + __m512 x2 = a1_2; + + + __m512 a1_3 = _mm512_load_ps(&q[(ldq*5)+32]); + __m512 a2_3 = _mm512_load_ps(&q[(ldq*4)+32]); + __m512 a3_3 = _mm512_load_ps(&q[(ldq*3)+32]); + __m512 a4_3 = _mm512_load_ps(&q[(ldq*2)+32]); + __m512 a5_3 = _mm512_load_ps(&q[(ldq)+32]); + __m512 a6_3 = _mm512_load_ps(&q[0+32]); + +// register __m512d t3 = _mm512_FMA_ps(a5_3, h_6_5, a6_3); + __m512 t3 = _mm512_FMA_ps(a5_3, h_6_5, a6_3); + + t3 = _mm512_FMA_ps(a4_3, h_6_4, t3); + t3 = _mm512_FMA_ps(a3_3, h_6_3, t3); + t3 = _mm512_FMA_ps(a2_3, h_6_2, t3); + t3 = _mm512_FMA_ps(a1_3, h_6_1, t3); + +// register __m512d v3 = _mm512_FMA_ps(a4_3, h_5_4, a5_3); + __m512 v3 = 
_mm512_FMA_ps(a4_3, h_5_4, a5_3); + + v3 = _mm512_FMA_ps(a3_3, h_5_3, v3); + v3 = _mm512_FMA_ps(a2_3, h_5_2, v3); + v3 = _mm512_FMA_ps(a1_3, h_5_1, v3); + +// register __m512d w3 = _mm512_FMA_ps(a3_3, h_4_3, a4_3); + __m512 w3 = _mm512_FMA_ps(a3_3, h_4_3, a4_3); + + w3 = _mm512_FMA_ps(a2_3, h_4_2, w3); + w3 = _mm512_FMA_ps(a1_3, h_4_1, w3); + +// register __m512d z3 = _mm512_FMA_ps(a2_3, h_3_2, a3_3); + __m512 z3 = _mm512_FMA_ps(a2_3, h_3_2, a3_3); + + z3 = _mm512_FMA_ps(a1_3, h_3_1, z3); +// register __m512d y3 = _mm512_FMA_ps(a1_3, h_2_1, a2_3); + __m512 y3 = _mm512_FMA_ps(a1_3, h_2_1, a2_3); + + +// register __m512d x3 = a1_3; + __m512 x3 = a1_3; + + + __m512 a1_4 = _mm512_load_ps(&q[(ldq*5)+48]); + __m512 a2_4 = _mm512_load_ps(&q[(ldq*4)+48]); + __m512 a3_4 = _mm512_load_ps(&q[(ldq*3)+48]); + __m512 a4_4 = _mm512_load_ps(&q[(ldq*2)+48]); + __m512 a5_4 = _mm512_load_ps(&q[(ldq)+48]); + __m512 a6_4 = _mm512_load_ps(&q[0+48]); + +// register __m512d t4 = _mm512_FMA_ps(a5_4, h_6_5, a6_4); + __m512 t4 = _mm512_FMA_ps(a5_4, h_6_5, a6_4); + + t4 = _mm512_FMA_ps(a4_4, h_6_4, t4); + t4 = _mm512_FMA_ps(a3_4, h_6_3, t4); + t4 = _mm512_FMA_ps(a2_4, h_6_2, t4); + t4 = _mm512_FMA_ps(a1_4, h_6_1, t4); + +// register __m512d v4 = _mm512_FMA_ps(a4_4, h_5_4, a5_4); + __m512 v4 = _mm512_FMA_ps(a4_4, h_5_4, a5_4); + + v4 = _mm512_FMA_ps(a3_4, h_5_3, v4); + v4 = _mm512_FMA_ps(a2_4, h_5_2, v4); + v4 = _mm512_FMA_ps(a1_4, h_5_1, v4); + +// register __m512d w4 = _mm512_FMA_ps(a3_4, h_4_3, a4_4); + __m512 w4 = _mm512_FMA_ps(a3_4, h_4_3, a4_4); - __m256 q1; -// __m256d q2; + w4 = _mm512_FMA_ps(a2_4, h_4_2, w4); + w4 = _mm512_FMA_ps(a1_4, h_4_1, w4); - __m256 h1; - __m256 h2; - __m256 h3; - __m256 h4; - __m256 h5; - __m256 h6; +// register __m512d z4 = _mm512_FMA_ps(a2_4, h_3_2, a3_4); + __m512 z4 = _mm512_FMA_ps(a2_4, h_3_2, a3_4); + + z4 = _mm512_FMA_ps(a1_4, h_3_1, z4); +// register __m512d y4 = _mm512_FMA_ps(a1_4, h_2_1, a2_4); + __m512 y4 = _mm512_FMA_ps(a1_4, h_2_1, a2_4); + + +// register __m512d x4 = a1_4; + __m512 x4 = a1_4; + + + __m512 q1; + __m512 q2; + __m512 q3; + __m512 q4; + + __m512 h1; + __m512 h2; + __m512 h3; + __m512 h4; + __m512 h5; + __m512 h6; for(i = 6; i < nb; i++) { - h1 = _mm256_broadcast_ss(&hh[i-5]); - q1 = _mm256_load_ps(&q[i*ldq]); -// q2 = _mm256_load_pd(&q[(i*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_ps(q1, h1, x1); -// x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); -// x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_ps(q1, h2, y1); -// y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); -// y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_ps(q1, h3, z1); -// z2 = _mm256_FMA_pd(q2, h3, z2); -#else - z1 = _mm256_add_ps(z1, _mm256_mul_ps(q1,h3)); -// z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_ps(q1, h4, w1); -// w2 = _mm256_FMA_pd(q2, h4, w2); -#else - w1 = _mm256_add_ps(w1, _mm256_mul_ps(q1,h4)); -// w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMA_ps(q1, h5, v1); -// v2 = _mm256_FMA_pd(q2, h5, v2); -#else - v1 = _mm256_add_ps(v1, _mm256_mul_ps(q1,h5)); -// v2 = _mm256_add_pd(v2, _mm256_mul_pd(q2,h5)); -#endif - h6 = 
_mm256_broadcast_ss(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - t1 = _mm256_FMA_ps(q1, h6, t1); -// t2 = _mm256_FMA_pd(q2, h6, t2); -#else - t1 = _mm256_add_ps(t1, _mm256_mul_ps(q1,h6)); -// t2 = _mm256_add_pd(t2, _mm256_mul_pd(q2,h6)); -#endif + h1 = _mm512_set1_ps(hh[i-5]); + q1 = _mm512_load_ps(&q[i*ldq]); + q2 = _mm512_load_ps(&q[(i*ldq)+16]); + q3 = _mm512_load_ps(&q[(i*ldq)+32]); + q4 = _mm512_load_ps(&q[(i*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); + x3 = _mm512_FMA_ps(q3, h1, x3); + x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+i-4]); + + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); + y3 = _mm512_FMA_ps(q3, h2, y3); + y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+i-3]); + + z1 = _mm512_FMA_ps(q1, h3, z1); + z2 = _mm512_FMA_ps(q2, h3, z2); + z3 = _mm512_FMA_ps(q3, h3, z3); + z4 = _mm512_FMA_ps(q4, h3, z4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+i-2]); + + w1 = _mm512_FMA_ps(q1, h4, w1); + w2 = _mm512_FMA_ps(q2, h4, w2); + w3 = _mm512_FMA_ps(q3, h4, w3); + w4 = _mm512_FMA_ps(q4, h4, w4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+i-1]); + + v1 = _mm512_FMA_ps(q1, h5, v1); + v2 = _mm512_FMA_ps(q2, h5, v2); + v3 = _mm512_FMA_ps(q3, h5, v3); + v4 = _mm512_FMA_ps(q4, h5, v4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+i]); + + t1 = _mm512_FMA_ps(q1, h6, t1); + t2 = _mm512_FMA_ps(q2, h6, t2); + t3 = _mm512_FMA_ps(q3, h6, t3); + t4 = _mm512_FMA_ps(q4, h6, t4); } - h1 = _mm256_broadcast_ss(&hh[nb-5]); - q1 = _mm256_load_ps(&q[nb*ldq]); -// q2 = _mm256_load_pd(&q[(nb*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_ps(q1, h1, x1); -// x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); -// x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_ps(q1, h2, y1); -// y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); -// y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_ps(q1, h3, z1); -// z2 = _mm256_FMA_pd(q2, h3, z2); -#else - z1 = _mm256_add_ps(z1, _mm256_mul_ps(q1,h3)); -// z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_ps(q1, h4, w1); -// w2 = _mm256_FMA_pd(q2, h4, w2); -#else - w1 = _mm256_add_ps(w1, _mm256_mul_ps(q1,h4)); -// w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMA_ps(q1, h5, v1); -// v2 = _mm256_FMA_pd(q2, h5, v2); -#else - v1 = _mm256_add_ps(v1, _mm256_mul_ps(q1,h5)); -// v2 = _mm256_add_pd(v2, _mm256_mul_pd(q2,h5)); -#endif + h1 = _mm512_set1_ps(hh[nb-5]); + q1 = _mm512_load_ps(&q[nb*ldq]); + q2 = _mm512_load_ps(&q[(nb*ldq)+16]); + q3 = _mm512_load_ps(&q[(nb*ldq)+32]); + q4 = _mm512_load_ps(&q[(nb*ldq)+48]); - h1 = _mm256_broadcast_ss(&hh[nb-4]); - q1 = _mm256_load_ps(&q[(nb+1)*ldq]); -// q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_ps(q1, h1, x1); -// x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); -// x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_ps(q1, h2, y1); -// y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); -// y2 = _mm256_add_pd(y2, 
_mm256_mul_pd(q2,h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_ps(q1, h3, z1); -// z2 = _mm256_FMA_pd(q2, h3, z2); -#else - z1 = _mm256_add_ps(z1, _mm256_mul_ps(q1,h3)); -// z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_ps(q1, h4, w1); -// w2 = _mm256_FMA_pd(q2, h4, w2); -#else - w1 = _mm256_add_ps(w1, _mm256_mul_ps(q1,h4)); -// w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); -#endif + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); + x3 = _mm512_FMA_ps(q3, h1, x3); + x4 = _mm512_FMA_ps(q4, h1, x4); - h1 = _mm256_broadcast_ss(&hh[nb-3]); - q1 = _mm256_load_ps(&q[(nb+2)*ldq]); -// q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_ps(q1, h1, x1); -// x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); -// x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_ps(q1, h2, y1); -// y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); -// y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_ps(q1, h3, z1); -// z2 = _mm256_FMA_pd(q2, h3, z2); -#else - z1 = _mm256_add_ps(z1, _mm256_mul_ps(q1,h3)); -// z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif + h2 = _mm512_set1_ps(hh[ldh+nb-4]); - h1 = _mm256_broadcast_ss(&hh[nb-2]); - q1 = _mm256_load_ps(&q[(nb+3)*ldq]); -// q2 = _mm256_load_pd(&q[((nb+3)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_ps(q1, h1, x1); -// x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); -// x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_ps(q1, h2, y1); -// y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); -// y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); + y3 = _mm512_FMA_ps(q3, h2, y3); + y4 = _mm512_FMA_ps(q4, h2, y4); - h1 = _mm256_broadcast_ss(&hh[nb-1]); - q1 = _mm256_load_ps(&q[(nb+4)*ldq]); -// q2 = _mm256_load_pd(&q[((nb+4)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_ps(q1, h1, x1); -// x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); -// x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-3]); + + z1 = _mm512_FMA_ps(q1, h3, z1); + z2 = _mm512_FMA_ps(q2, h3, z2); + z3 = _mm512_FMA_ps(q3, h3, z3); + z4 = _mm512_FMA_ps(q4, h3, z4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-2]); + + w1 = _mm512_FMA_ps(q1, h4, w1); + w2 = _mm512_FMA_ps(q2, h4, w2); + w3 = _mm512_FMA_ps(q3, h4, w3); + w4 = _mm512_FMA_ps(q4, h4, w4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+nb-1]); + + v1 = _mm512_FMA_ps(q1, h5, v1); + v2 = _mm512_FMA_ps(q2, h5, v2); + v3 = _mm512_FMA_ps(q3, h5, v3); + v4 = _mm512_FMA_ps(q4, h5, v4); + + h1 = _mm512_set1_ps(hh[nb-4]); + + q1 = _mm512_load_ps(&q[(nb+1)*ldq]); + q2 = _mm512_load_ps(&q[((nb+1)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+1)*ldq)+32]); + q4 = _mm512_load_ps(&q[((nb+1)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); + x3 = _mm512_FMA_ps(q3, h1, x3); + x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-3]); + + y1 = _mm512_FMA_ps(q1, 
h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); + y3 = _mm512_FMA_ps(q3, h2, y3); + y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-2]); + + z1 = _mm512_FMA_ps(q1, h3, z1); + z2 = _mm512_FMA_ps(q2, h3, z2); + z3 = _mm512_FMA_ps(q3, h3, z3); + z4 = _mm512_FMA_ps(q4, h3, z4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-1]); + + w1 = _mm512_FMA_ps(q1, h4, w1); + w2 = _mm512_FMA_ps(q2, h4, w2); + w3 = _mm512_FMA_ps(q3, h4, w3); + w4 = _mm512_FMA_ps(q4, h4, w4); + + h1 = _mm512_set1_ps(hh[nb-3]); + q1 = _mm512_load_ps(&q[(nb+2)*ldq]); + q2 = _mm512_load_ps(&q[((nb+2)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+2)*ldq)+32]); + q4 = _mm512_load_ps(&q[((nb+2)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); + x3 = _mm512_FMA_ps(q3, h1, x3); + x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-2]); + + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); + y3 = _mm512_FMA_ps(q3, h2, y3); + y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-1]); + + z1 = _mm512_FMA_ps(q1, h3, z1); + z2 = _mm512_FMA_ps(q2, h3, z2); + z3 = _mm512_FMA_ps(q3, h3, z3); + z4 = _mm512_FMA_ps(q4, h3, z4); + + h1 = _mm512_set1_ps(hh[nb-2]); + q1 = _mm512_load_ps(&q[(nb+3)*ldq]); + q2 = _mm512_load_ps(&q[((nb+3)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+3)*ldq)+32]); + q4 = _mm512_load_ps(&q[((nb+3)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); + x3 = _mm512_FMA_ps(q3, h1, x3); + x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-1]); + + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); + y3 = _mm512_FMA_ps(q3, h2, y3); + y4 = _mm512_FMA_ps(q4, h2, y4); + + h1 = _mm512_set1_ps(hh[nb-1]); + q1 = _mm512_load_ps(&q[(nb+4)*ldq]); + q2 = _mm512_load_ps(&q[((nb+4)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+4)*ldq)+32]); + q4 = _mm512_load_ps(&q[((nb+4)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); + x3 = _mm512_FMA_ps(q3, h1, x3); + x4 = _mm512_FMA_ps(q4, h1, x4); ///////////////////////////////////////////////////// // Apply tau, correct wrong calculation using pre-calculated scalar products ///////////////////////////////////////////////////// - __m256 tau1 = _mm256_broadcast_ss(&hh[0]); - x1 = _mm256_mul_ps(x1, tau1); -// x2 = _mm256_mul_pd(x2, tau1); - - __m256 tau2 = _mm256_broadcast_ss(&hh[ldh]); - __m256 vs_1_2 = _mm256_broadcast_ss(&scalarprods[0]); - h2 = _mm256_mul_ps(tau2, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMSUB_ps(y1, tau2, _mm256_mul_ps(x1,h2)); -// y2 = _mm256_FMSUB_pd(y2, tau2, _mm256_mul_pd(x2,h2)); -#else - y1 = _mm256_sub_ps(_mm256_mul_ps(y1,tau2), _mm256_mul_ps(x1,h2)); -// y2 = _mm256_sub_pd(_mm256_mul_pd(y2,tau2), _mm256_mul_pd(x2,h2)); -#endif + __m512 tau1 = _mm512_set1_ps(hh[0]); + x1 = _mm512_mul_ps(x1, tau1); + x2 = _mm512_mul_ps(x2, tau1); + x3 = _mm512_mul_ps(x3, tau1); + x4 = _mm512_mul_ps(x4, tau1); - __m256 tau3 = _mm256_broadcast_ss(&hh[ldh*2]); - __m256 vs_1_3 = _mm256_broadcast_ss(&scalarprods[1]); - __m256 vs_2_3 = _mm256_broadcast_ss(&scalarprods[2]); - h2 = _mm256_mul_ps(tau3, vs_1_3); - h3 = _mm256_mul_ps(tau3, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMSUB_ps(z1, tau3, _mm256_FMA_ps(y1, h3, _mm256_mul_ps(x1,h2))); -// z2 = _mm256_FMSUB_pd(z2, tau3, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))); -#else - z1 = _mm256_sub_ps(_mm256_mul_ps(z1,tau3), _mm256_add_ps(_mm256_mul_ps(y1,h3), _mm256_mul_ps(x1,h2))); -// z2 = _mm256_sub_pd(_mm256_mul_pd(z2,tau3), _mm256_add_pd(_mm256_mul_pd(y2,h3), 
_mm256_mul_pd(x2,h2))); -#endif + __m512 tau2 = _mm512_set1_ps(hh[ldh]); + __m512 vs_1_2 = _mm512_set1_ps(scalarprods[0]); + h2 = _mm512_mul_ps(tau2, vs_1_2); - __m256 tau4 = _mm256_broadcast_ss(&hh[ldh*3]); - __m256 vs_1_4 = _mm256_broadcast_ss(&scalarprods[3]); - __m256 vs_2_4 = _mm256_broadcast_ss(&scalarprods[4]); - h2 = _mm256_mul_ps(tau4, vs_1_4); - h3 = _mm256_mul_ps(tau4, vs_2_4); - __m256 vs_3_4 = _mm256_broadcast_ss(&scalarprods[5]); - h4 = _mm256_mul_ps(tau4, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMSUB_ps(w1, tau4, _mm256_FMA_ps(z1, h4, _mm256_FMA_ps(y1, h3, _mm256_mul_ps(x1,h2)))); -// w2 = _mm256_FMSUB_pd(w2, tau4, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); -#else - w1 = _mm256_sub_ps(_mm256_mul_ps(w1,tau4), _mm256_add_ps(_mm256_mul_ps(z1,h4), _mm256_add_ps(_mm256_mul_ps(y1,h3), _mm256_mul_ps(x1,h2)))); -// w2 = _mm256_sub_pd(_mm256_mul_pd(w2,tau4), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); -#endif + y1 = _mm512_FMSUB_ps(y1, tau2, _mm512_mul_ps(x1,h2)); + y2 = _mm512_FMSUB_ps(y2, tau2, _mm512_mul_ps(x2,h2)); + y3 = _mm512_FMSUB_ps(y3, tau2, _mm512_mul_ps(x3,h2)); + y4 = _mm512_FMSUB_ps(y4, tau2, _mm512_mul_ps(x4,h2)); - __m256 tau5 = _mm256_broadcast_ss(&hh[ldh*4]); - __m256 vs_1_5 = _mm256_broadcast_ss(&scalarprods[6]); - __m256 vs_2_5 = _mm256_broadcast_ss(&scalarprods[7]); - h2 = _mm256_mul_ps(tau5, vs_1_5); - h3 = _mm256_mul_ps(tau5, vs_2_5); - __m256 vs_3_5 = _mm256_broadcast_ss(&scalarprods[8]); - __m256 vs_4_5 = _mm256_broadcast_ss(&scalarprods[9]); - h4 = _mm256_mul_ps(tau5, vs_3_5); - h5 = _mm256_mul_ps(tau5, vs_4_5); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMSUB_ps(v1, tau5, _mm256_add_ps(_mm256_FMA_ps(w1, h5, _mm256_mul_ps(z1,h4)), _mm256_FMA_ps(y1, h3, _mm256_mul_ps(x1,h2)))); -// v2 = _mm256_FMSUB_pd(v2, tau5, _mm256_add_pd(_mm256_FMA_pd(w2, h5, _mm256_mul_pd(z2,h4)), _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); -#else - v1 = _mm256_sub_ps(_mm256_mul_ps(v1,tau5), _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(w1,h5), _mm256_mul_ps(z1,h4)), _mm256_add_ps(_mm256_mul_ps(y1,h3), _mm256_mul_ps(x1,h2)))); -// v2 = _mm256_sub_pd(_mm256_mul_pd(v2,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2,h5), _mm256_mul_pd(z2,h4)), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); -#endif + __m512 tau3 = _mm512_set1_ps(hh[ldh*2]); + __m512 vs_1_3 = _mm512_set1_ps(scalarprods[1]); + __m512 vs_2_3 = _mm512_set1_ps(scalarprods[2]); + + h2 = _mm512_mul_ps(tau3, vs_1_3); + h3 = _mm512_mul_ps(tau3, vs_2_3); + + z1 = _mm512_FMSUB_ps(z1, tau3, _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2))); + z2 = _mm512_FMSUB_ps(z2, tau3, _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2))); + z3 = _mm512_FMSUB_ps(z3, tau3, _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2))); + z4 = _mm512_FMSUB_ps(z4, tau3, _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2))); + + __m512 tau4 = _mm512_set1_ps(hh[ldh*3]); + __m512 vs_1_4 = _mm512_set1_ps(scalarprods[3]); + __m512 vs_2_4 = _mm512_set1_ps(scalarprods[4]); + + h2 = _mm512_mul_ps(tau4, vs_1_4); + h3 = _mm512_mul_ps(tau4, vs_2_4); + + __m512 vs_3_4 = _mm512_set1_ps(scalarprods[5]); + h4 = _mm512_mul_ps(tau4, vs_3_4); + + w1 = _mm512_FMSUB_ps(w1, tau4, _mm512_FMA_ps(z1, h4, _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2)))); + w2 = _mm512_FMSUB_ps(w2, tau4, _mm512_FMA_ps(z2, h4, _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2)))); + w3 = _mm512_FMSUB_ps(w3, tau4, _mm512_FMA_ps(z3, h4, _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2)))); + w4 = _mm512_FMSUB_ps(w4, tau4, _mm512_FMA_ps(z4, h4, 
_mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2)))); + + __m512 tau5 = _mm512_set1_ps(hh[ldh*4]); + __m512 vs_1_5 = _mm512_set1_ps(scalarprods[6]); + __m512 vs_2_5 = _mm512_set1_ps(scalarprods[7]); + + h2 = _mm512_mul_ps(tau5, vs_1_5); + h3 = _mm512_mul_ps(tau5, vs_2_5); + + __m512 vs_3_5 = _mm512_set1_ps(scalarprods[8]); + __m512 vs_4_5 = _mm512_set1_ps(scalarprods[9]); + + h4 = _mm512_mul_ps(tau5, vs_3_5); + h5 = _mm512_mul_ps(tau5, vs_4_5); + + v1 = _mm512_FMSUB_ps(v1, tau5, _mm512_add_ps(_mm512_FMA_ps(w1, h5, _mm512_mul_ps(z1,h4)), _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2)))); + v2 = _mm512_FMSUB_ps(v2, tau5, _mm512_add_ps(_mm512_FMA_ps(w2, h5, _mm512_mul_ps(z2,h4)), _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2)))); + v3 = _mm512_FMSUB_ps(v3, tau5, _mm512_add_ps(_mm512_FMA_ps(w3, h5, _mm512_mul_ps(z3,h4)), _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2)))); + v4 = _mm512_FMSUB_ps(v4, tau5, _mm512_add_ps(_mm512_FMA_ps(w4, h5, _mm512_mul_ps(z4,h4)), _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2)))); + + __m512 tau6 = _mm512_set1_ps(hh[ldh*5]); + __m512 vs_1_6 = _mm512_set1_ps(scalarprods[10]); + __m512 vs_2_6 = _mm512_set1_ps(scalarprods[11]); + h2 = _mm512_mul_ps(tau6, vs_1_6); + h3 = _mm512_mul_ps(tau6, vs_2_6); + + __m512 vs_3_6 = _mm512_set1_ps(scalarprods[12]); + __m512 vs_4_6 = _mm512_set1_ps(scalarprods[13]); + __m512 vs_5_6 = _mm512_set1_ps(scalarprods[14]); + + h4 = _mm512_mul_ps(tau6, vs_3_6); + h5 = _mm512_mul_ps(tau6, vs_4_6); + h6 = _mm512_mul_ps(tau6, vs_5_6); + + t1 = _mm512_FMSUB_ps(t1, tau6, _mm512_FMA_ps(v1, h6, _mm512_add_ps(_mm512_FMA_ps(w1, h5, _mm512_mul_ps(z1,h4)), _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2))))); + t2 = _mm512_FMSUB_ps(t2, tau6, _mm512_FMA_ps(v2, h6, _mm512_add_ps(_mm512_FMA_ps(w2, h5, _mm512_mul_ps(z2,h4)), _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2))))); + t3 = _mm512_FMSUB_ps(t3, tau6, _mm512_FMA_ps(v3, h6, _mm512_add_ps(_mm512_FMA_ps(w3, h5, _mm512_mul_ps(z3,h4)), _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2))))); + t4 = _mm512_FMSUB_ps(t4, tau6, _mm512_FMA_ps(v4, h6, _mm512_add_ps(_mm512_FMA_ps(w4, h5, _mm512_mul_ps(z4,h4)), _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2))))); - __m256 tau6 = _mm256_broadcast_ss(&hh[ldh*5]); - __m256 vs_1_6 = _mm256_broadcast_ss(&scalarprods[10]); - __m256 vs_2_6 = _mm256_broadcast_ss(&scalarprods[11]); - h2 = _mm256_mul_ps(tau6, vs_1_6); - h3 = _mm256_mul_ps(tau6, vs_2_6); - __m256 vs_3_6 = _mm256_broadcast_ss(&scalarprods[12]); - __m256 vs_4_6 = _mm256_broadcast_ss(&scalarprods[13]); - __m256 vs_5_6 = _mm256_broadcast_ss(&scalarprods[14]); - h4 = _mm256_mul_ps(tau6, vs_3_6); - h5 = _mm256_mul_ps(tau6, vs_4_6); - h6 = _mm256_mul_ps(tau6, vs_5_6); -#ifdef __ELPA_USE_FMA__ - t1 = _mm256_FMSUB_ps(t1, tau6, _mm256_FMA_ps(v1, h6, _mm256_add_ps(_mm256_FMA_ps(w1, h5, _mm256_mul_ps(z1,h4)), _mm256_FMA_ps(y1, h3, _mm256_mul_ps(x1,h2))))); -// t2 = _mm256_FMSUB_pd(t2, tau6, _mm256_FMA_pd(v2, h6, _mm256_add_pd(_mm256_FMA_pd(w2, h5, _mm256_mul_pd(z2,h4)), _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))))); -#else - t1 = _mm256_sub_ps(_mm256_mul_ps(t1,tau6), _mm256_add_ps( _mm256_mul_ps(v1,h6), _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(w1,h5), _mm256_mul_ps(z1,h4)), _mm256_add_ps(_mm256_mul_ps(y1,h3), _mm256_mul_ps(x1,h2))))); -// t2 = _mm256_sub_pd(_mm256_mul_pd(t2,tau6), _mm256_add_pd( _mm256_mul_pd(v2,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2,h5), _mm256_mul_pd(z2,h4)), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))))); -#endif ///////////////////////////////////////////////////// // Rank-1 update of Q [8 x nb+3] 
///////////////////////////////////////////////////// - q1 = _mm256_load_ps(&q[0]); -// q2 = _mm256_load_pd(&q[4]); - q1 = _mm256_sub_ps(q1, t1); -// q2 = _mm256_sub_pd(q2, t2); - _mm256_store_ps(&q[0],q1); -// _mm256_store_pd(&q[4],q2); - - h6 = _mm256_broadcast_ss(&hh[(ldh*5)+1]); - q1 = _mm256_load_ps(&q[ldq]); -// q2 = _mm256_load_pd(&q[(ldq+4)]); - q1 = _mm256_sub_ps(q1, v1); -// q2 = _mm256_sub_pd(q2, v2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(t1, h6, q1); -// q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_ps(&q[ldq],q1); -// _mm256_store_pd(&q[(ldq+4)],q2); - - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+1]); - q1 = _mm256_load_ps(&q[ldq*2]); -// q2 = _mm256_load_pd(&q[(ldq*2)+4]); - q1 = _mm256_sub_ps(q1, w1); -// q2 = _mm256_sub_pd(q2, w2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(v1, h5, q1); -// q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_ss(&hh[(ldh*5)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(t1, h6, q1); -// q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_ps(&q[ldq*2],q1); -// _mm256_store_pd(&q[(ldq*2)+4],q2); - - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+1]); - q1 = _mm256_load_ps(&q[ldq*3]); -// q2 = _mm256_load_pd(&q[(ldq*3)+4]); - q1 = _mm256_sub_ps(q1, z1); -// q2 = _mm256_sub_pd(q2, z2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(w1, h4, q1); -// q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(w1, h4)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(v1, h5, q1); -// q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_ss(&hh[(ldh*5)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(t1, h6, q1); -// q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_ps(&q[ldq*3],q1); -// _mm256_store_pd(&q[(ldq*3)+4],q2); - - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+1]); - q1 = _mm256_load_ps(&q[ldq*4]); -// q2 = _mm256_load_pd(&q[(ldq*4)+4]); - q1 = _mm256_sub_ps(q1, y1); -// q2 = _mm256_sub_pd(q2, y2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(z1, h3, q1); -// q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(z1, h3)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(w1, h4, q1); -// q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(w1, h4)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(v1, h5, q1); -// q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_ss(&hh[(ldh*5)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(t1, h6, q1); -// q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); 
-#endif - _mm256_store_ps(&q[ldq*4],q1); -// _mm256_store_pd(&q[(ldq*4)+4],q2); - - h2 = _mm256_broadcast_ss(&hh[(ldh)+1]); - q1 = _mm256_load_ps(&q[ldq*5]); -// q2 = _mm256_load_pd(&q[(ldq*5)+4]); - q1 = _mm256_sub_ps(q1, x1); -// q2 = _mm256_sub_pd(q2, x2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(y1, h2, q1); -// q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(y1, h2)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(z1, h3, q1); -// q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(z1, h3)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(w1, h4, q1); -// q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(w1, h4)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(v1, h5, q1); -// q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_ss(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(t1, h6, q1); -// q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_ps(&q[ldq*5],q1); -// _mm256_store_pd(&q[(ldq*5)+4],q2); + q1 = _mm512_load_ps(&q[0]); + q2 = _mm512_load_ps(&q[0+16]); + q3 = _mm512_load_ps(&q[0+32]); + q4 = _mm512_load_ps(&q[0+48]); + + q1 = _mm512_sub_ps(q1, t1); + q2 = _mm512_sub_ps(q2, t2); + q3 = _mm512_sub_ps(q3, t3); + q4 = _mm512_sub_ps(q4, t4); + + _mm512_store_ps(&q[0],q1); + _mm512_store_ps(&q[0+16],q2); + _mm512_store_ps(&q[0+32],q3); + _mm512_store_ps(&q[0+48],q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+1]); + q1 = _mm512_load_ps(&q[ldq]); + q2 = _mm512_load_ps(&q[ldq+16]); + q3 = _mm512_load_ps(&q[ldq+32]); + q4 = _mm512_load_ps(&q[ldq+48]); + + q1 = _mm512_sub_ps(q1, v1); + q2 = _mm512_sub_ps(q2, v2); + q3 = _mm512_sub_ps(q3, v3); + q4 = _mm512_sub_ps(q4, v4); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); + q3 = _mm512_NFMA_ps(t3, h6, q3); + q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq],q1); + _mm512_store_ps(&q[ldq+16],q2); + _mm512_store_ps(&q[ldq+32],q3); + _mm512_store_ps(&q[ldq+48],q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+1]); + q1 = _mm512_load_ps(&q[ldq*2]); + q2 = _mm512_load_ps(&q[(ldq*2)+16]); + q3 = _mm512_load_ps(&q[(ldq*2)+32]); + q4 = _mm512_load_ps(&q[(ldq*2)+48]); + + q1 = _mm512_sub_ps(q1, w1); + q2 = _mm512_sub_ps(q2, w2); + q3 = _mm512_sub_ps(q3, w3); + q4 = _mm512_sub_ps(q4, w4); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); + q3 = _mm512_NFMA_ps(v3, h5, q3); + q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+2]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); + q3 = _mm512_NFMA_ps(t3, h6, q3); + q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*2],q1); + _mm512_store_ps(&q[(ldq*2)+16],q2); + _mm512_store_ps(&q[(ldq*2)+32],q3); + _mm512_store_ps(&q[(ldq*2)+48],q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+1]); + q1 = _mm512_load_ps(&q[ldq*3]); + q2 = _mm512_load_ps(&q[(ldq*3)+16]); + q3 = _mm512_load_ps(&q[(ldq*3)+32]); + q4 = _mm512_load_ps(&q[(ldq*3)+48]); + + q1 = _mm512_sub_ps(q1, z1); + q2 = 
_mm512_sub_ps(q2, z2); + q3 = _mm512_sub_ps(q3, z3); + q4 = _mm512_sub_ps(q4, z4); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); + q3 = _mm512_NFMA_ps(w3, h4, q3); + q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+2]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); + q3 = _mm512_NFMA_ps(v3, h5, q3); + q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+3]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); + q3 = _mm512_NFMA_ps(t3, h6, q3); + q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*3],q1); + _mm512_store_ps(&q[(ldq*3)+16],q2); + _mm512_store_ps(&q[(ldq*3)+32],q3); + _mm512_store_ps(&q[(ldq*3)+48],q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+1]); + q1 = _mm512_load_ps(&q[ldq*4]); + q2 = _mm512_load_ps(&q[(ldq*4)+16]); + q3 = _mm512_load_ps(&q[(ldq*4)+32]); + q4 = _mm512_load_ps(&q[(ldq*4)+48]); + + q1 = _mm512_sub_ps(q1, y1); + q2 = _mm512_sub_ps(q2, y2); + q3 = _mm512_sub_ps(q3, y3); + q4 = _mm512_sub_ps(q4, y4); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); + q3 = _mm512_NFMA_ps(z3, h3, q3); + q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+2]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); + q3 = _mm512_NFMA_ps(w3, h4, q3); + q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+3]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); + q3 = _mm512_NFMA_ps(v3, h5, q3); + q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+4]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); + q3 = _mm512_NFMA_ps(t3, h6, q3); + q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*4],q1); + _mm512_store_ps(&q[(ldq*4)+16],q2); + _mm512_store_ps(&q[(ldq*4)+32],q3); + _mm512_store_ps(&q[(ldq*4)+48],q4); + + h2 = _mm512_set1_ps(hh[(ldh)+1]); + q1 = _mm512_load_ps(&q[ldq*5]); + q2 = _mm512_load_ps(&q[(ldq*5)+16]); + q3 = _mm512_load_ps(&q[(ldq*5)+32]); + q4 = _mm512_load_ps(&q[(ldq*5)+48]); + + q1 = _mm512_sub_ps(q1, x1); + q2 = _mm512_sub_ps(q2, x2); + q3 = _mm512_sub_ps(q3, x3); + q4 = _mm512_sub_ps(q4, x4); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); + q3 = _mm512_NFMA_ps(y3, h2, q3); + q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+2]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); + q3 = _mm512_NFMA_ps(z3, h3, q3); + q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+3]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); + q3 = _mm512_NFMA_ps(w3, h4, q3); + q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+4]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); + q3 = _mm512_NFMA_ps(v3, h5, q3); + q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+5]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); + q3 = _mm512_NFMA_ps(t3, h6, q3); + q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*5],q1); + _mm512_store_ps(&q[(ldq*5)+16],q2); + _mm512_store_ps(&q[(ldq*5)+32],q3); + _mm512_store_ps(&q[(ldq*5)+48],q4); for (i = 6; i < nb; i++) { - q1 = _mm256_load_ps(&q[i*ldq]); -// q2 = _mm256_load_pd(&q[(i*ldq)+4]); - h1 = _mm256_broadcast_ss(&hh[i-5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(x1, h1, q1); -// q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(x1, h1)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, 
h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(y1, h2, q1); -// q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(y1, h2)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(z1, h3, q1); -// q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(z1, h3)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(w1, h4, q1); -// q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(w1, h4)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(v1, h5, q1); -// q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_ss(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(t1, h6, q1); -// q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_ps(&q[i*ldq],q1); -// _mm256_store_pd(&q[(i*ldq)+4],q2); + q1 = _mm512_load_ps(&q[i*ldq]); + q2 = _mm512_load_ps(&q[(i*ldq)+16]); + q3 = _mm512_load_ps(&q[(i*ldq)+32]); + q4 = _mm512_load_ps(&q[(i*ldq)+48]); + + h1 = _mm512_set1_ps(hh[i-5]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); + q3 = _mm512_NFMA_ps(x3, h1, q3); + q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+i-4]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); + q3 = _mm512_NFMA_ps(y3, h2, q3); + q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+i-3]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); + q3 = _mm512_NFMA_ps(z3, h3, q3); + q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+i-2]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); + q3 = _mm512_NFMA_ps(w3, h4, q3); + q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+i-1]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); + q3 = _mm512_NFMA_ps(v3, h5, q3); + q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+i]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); + q3 = _mm512_NFMA_ps(t3, h6, q3); + q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[i*ldq],q1); + _mm512_store_ps(&q[(i*ldq)+16],q2); + _mm512_store_ps(&q[(i*ldq)+32],q3); + _mm512_store_ps(&q[(i*ldq)+48],q4); + } - h1 = _mm256_broadcast_ss(&hh[nb-5]); - q1 = _mm256_load_ps(&q[nb*ldq]); -// q2 = _mm256_load_pd(&q[(nb*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(x1, h1, q1); -// q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(x1, h1)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(y1, h2, q1); -// q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(y1, h2)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(z1, h3, q1); -// q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(z1, h3)); -// q2 = 
_mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(w1, h4, q1); -// q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(w1, h4)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(v1, h5, q1); -// q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - _mm256_store_ps(&q[nb*ldq],q1); -// _mm256_store_pd(&q[(nb*ldq)+4],q2); - - h1 = _mm256_broadcast_ss(&hh[nb-4]); - q1 = _mm256_load_ps(&q[(nb+1)*ldq]); -// q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(x1, h1, q1); -// q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(x1, h1)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(y1, h2, q1); -// q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(y1, h2)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(z1, h3, q1); -// q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(z1, h3)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(w1, h4, q1); -// q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(w1, h4)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - _mm256_store_ps(&q[(nb+1)*ldq],q1); -// _mm256_store_pd(&q[((nb+1)*ldq)+4],q2); - - h1 = _mm256_broadcast_ss(&hh[nb-3]); - q1 = _mm256_load_ps(&q[(nb+2)*ldq]); -// q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(x1, h1, q1); -// q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(x1, h1)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(y1, h2, q1); -// q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(y1, h2)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(z1, h3, q1); -// q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(z1, h3)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - _mm256_store_ps(&q[(nb+2)*ldq],q1); -// _mm256_store_pd(&q[((nb+2)*ldq)+4],q2); - - h1 = _mm256_broadcast_ss(&hh[nb-2]); - q1 = _mm256_load_ps(&q[(nb+3)*ldq]); -// q2 = _mm256_load_pd(&q[((nb+3)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(x1, h1, q1); -// q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(x1, h1)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(y1, h2, q1); -// q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(y1, h2)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - _mm256_store_ps(&q[(nb+3)*ldq],q1); -// _mm256_store_pd(&q[((nb+3)*ldq)+4],q2); - - h1 = _mm256_broadcast_ss(&hh[nb-1]); - q1 = _mm256_load_ps(&q[(nb+4)*ldq]); -// q2 = 
_mm256_load_pd(&q[((nb+4)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(x1, h1, q1); -// q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(x1, h1)); -// q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - _mm256_store_ps(&q[(nb+4)*ldq],q1); -// _mm256_store_pd(&q[((nb+4)*ldq)+4],q2); + h1 = _mm512_set1_ps(hh[nb-5]); + q1 = _mm512_load_ps(&q[nb*ldq]); + q2 = _mm512_load_ps(&q[(nb*ldq)+16]); + q3 = _mm512_load_ps(&q[(nb*ldq)+32]); + q4 = _mm512_load_ps(&q[(nb*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); + q3 = _mm512_NFMA_ps(x3, h1, q3); + q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-4]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); + q3 = _mm512_NFMA_ps(y3, h2, q3); + q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-3]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); + q3 = _mm512_NFMA_ps(z3, h3, q3); + q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-2]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); + q3 = _mm512_NFMA_ps(w3, h4, q3); + q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+nb-1]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); + q3 = _mm512_NFMA_ps(v3, h5, q3); + q4 = _mm512_NFMA_ps(v4, h5, q4); + + _mm512_store_ps(&q[nb*ldq],q1); + _mm512_store_ps(&q[(nb*ldq)+16],q2); + _mm512_store_ps(&q[(nb*ldq)+32],q3); + _mm512_store_ps(&q[(nb*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-4]); + q1 = _mm512_load_ps(&q[(nb+1)*ldq]); + q2 = _mm512_load_ps(&q[((nb+1)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+1)*ldq)+32]); + q4 = _mm512_load_ps(&q[((nb+1)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); + q3 = _mm512_NFMA_ps(x3, h1, q3); + q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-3]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); + q3 = _mm512_NFMA_ps(y3, h2, q3); + q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-2]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); + q3 = _mm512_NFMA_ps(z3, h3, q3); + q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-1]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); + q3 = _mm512_NFMA_ps(w3, h4, q3); + q4 = _mm512_NFMA_ps(w4, h4, q4); + + _mm512_store_ps(&q[(nb+1)*ldq],q1); + _mm512_store_ps(&q[((nb+1)*ldq)+16],q2); + _mm512_store_ps(&q[((nb+1)*ldq)+32],q3); + _mm512_store_ps(&q[((nb+1)*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-3]); + q1 = _mm512_load_ps(&q[(nb+2)*ldq]); + q2 = _mm512_load_ps(&q[((nb+2)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+2)*ldq)+32]); + q4 = _mm512_load_ps(&q[((nb+2)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); + q3 = _mm512_NFMA_ps(x3, h1, q3); + q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-2]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); + q3 = _mm512_NFMA_ps(y3, h2, q3); + q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-1]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); + q3 = _mm512_NFMA_ps(z3, h3, q3); + q4 = _mm512_NFMA_ps(z4, h3, q4); + + _mm512_store_ps(&q[(nb+2)*ldq],q1); + _mm512_store_ps(&q[((nb+2)*ldq)+16],q2); + _mm512_store_ps(&q[((nb+2)*ldq)+32],q3); + _mm512_store_ps(&q[((nb+2)*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-2]); + q1 = 
_mm512_load_ps(&q[(nb+3)*ldq]); + q2 = _mm512_load_ps(&q[((nb+3)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+3)*ldq)+32]); + q4 = _mm512_load_ps(&q[((nb+3)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); + q3 = _mm512_NFMA_ps(x3, h1, q3); + q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-1]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); + q3 = _mm512_NFMA_ps(y3, h2, q3); + q4 = _mm512_NFMA_ps(y4, h2, q4); + + _mm512_store_ps(&q[(nb+3)*ldq],q1); + _mm512_store_ps(&q[((nb+3)*ldq)+16],q2); + _mm512_store_ps(&q[((nb+3)*ldq)+32],q3); + _mm512_store_ps(&q[((nb+3)*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-1]); + q1 = _mm512_load_ps(&q[(nb+4)*ldq]); + q2 = _mm512_load_ps(&q[((nb+4)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+4)*ldq)+32]); + q4 = _mm512_load_ps(&q[((nb+4)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); + q3 = _mm512_NFMA_ps(x3, h1, q3); + q4 = _mm512_NFMA_ps(x4, h1, q4); + + _mm512_store_ps(&q[(nb+4)*ldq],q1); + _mm512_store_ps(&q[((nb+4)*ldq)+16],q2); + _mm512_store_ps(&q[((nb+4)*ldq)+32],q3); + _mm512_store_ps(&q[((nb+4)*ldq)+48],q4); + } + + /** * Unrolled kernel that computes - * 4 rows of Q simultaneously, a + * 48 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 1 update is performed */ -__forceinline void hh_trafo_kernel_4_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods) +__forceinline void hh_trafo_kernel_48_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [8 x nb+3] * hh @@ -1013,559 +1101,2583 @@ __forceinline void hh_trafo_kernel_4_AVX512_6hv_single(float* q, float* hh, int ///////////////////////////////////////////////////// int i; - __m256 a1_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*5])); - __m256 a2_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*4])); - __m256 a3_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*3])); - __m256 a4_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*2])); - __m256 a5_1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq])); - // we just want to load 4 floats not 8 - __m256 a6_1 = _mm256_castps128_ps256(_mm_load_ps(&q[0])); // q(1,1) | q(2,1) | q(3,1) | q(4,1) -// __m256 a5_1 = _mm256_load_ps(&q[0]); - - __m256 h_6_5 = _mm256_broadcast_ss(&hh[(ldh*5)+1]); - __m256 h_6_4 = _mm256_broadcast_ss(&hh[(ldh*5)+2]); - __m256 h_6_3 = _mm256_broadcast_ss(&hh[(ldh*5)+3]); - __m256 h_6_2 = _mm256_broadcast_ss(&hh[(ldh*5)+4]); - __m256 h_6_1 = _mm256_broadcast_ss(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - register __m256 t1 = _mm256_FMA_ps(a5_1, h_6_5, a6_1); - t1 = _mm256_FMA_ps(a4_1, h_6_4, t1); - t1 = _mm256_FMA_ps(a3_1, h_6_3, t1); - t1 = _mm256_FMA_ps(a2_1, h_6_2, t1); - t1 = _mm256_FMA_ps(a1_1, h_6_1, t1); -#else - register __m256 t1 = _mm256_add_ps(a6_1, _mm256_mul_ps(a5_1, h_6_5)); - t1 = _mm256_add_ps(t1, _mm256_mul_ps(a4_1, h_6_4)); - t1 = _mm256_add_ps(t1, _mm256_mul_ps(a3_1, h_6_3)); - t1 = _mm256_add_ps(t1, _mm256_mul_ps(a2_1, h_6_2)); - t1 = _mm256_add_ps(t1, _mm256_mul_ps(a1_1, h_6_1)); -#endif - __m256 h_5_4 = _mm256_broadcast_ss(&hh[(ldh*4)+1]); - __m256 h_5_3 = _mm256_broadcast_ss(&hh[(ldh*4)+2]); - __m256 h_5_2 = _mm256_broadcast_ss(&hh[(ldh*4)+3]); - __m256 h_5_1 = _mm256_broadcast_ss(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - register __m256 v1 = _mm256_FMA_ps(a4_1, h_5_4, a5_1); - v1 = _mm256_FMA_ps(a3_1, h_5_3, v1); - v1 = 
_mm256_FMA_ps(a2_1, h_5_2, v1); - v1 = _mm256_FMA_ps(a1_1, h_5_1, v1); -#else - register __m256 v1 = _mm256_add_ps(a5_1, _mm256_mul_ps(a4_1, h_5_4)); - v1 = _mm256_add_ps(v1, _mm256_mul_ps(a3_1, h_5_3)); - v1 = _mm256_add_ps(v1, _mm256_mul_ps(a2_1, h_5_2)); - v1 = _mm256_add_ps(v1, _mm256_mul_ps(a1_1, h_5_1)); -#endif - __m256 h_4_3 = _mm256_broadcast_ss(&hh[(ldh*3)+1]); - __m256 h_4_2 = _mm256_broadcast_ss(&hh[(ldh*3)+2]); - __m256 h_4_1 = _mm256_broadcast_ss(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - register __m256 w1 = _mm256_FMA_ps(a3_1, h_4_3, a4_1); - w1 = _mm256_FMA_ps(a2_1, h_4_2, w1); - w1 = _mm256_FMA_ps(a1_1, h_4_1, w1); -#else - register __m256 w1 = _mm256_add_ps(a4_1, _mm256_mul_ps(a3_1, h_4_3)); - w1 = _mm256_add_ps(w1, _mm256_mul_ps(a2_1, h_4_2)); - w1 = _mm256_add_ps(w1, _mm256_mul_ps(a1_1, h_4_1)); -#endif - __m256 h_2_1 = _mm256_broadcast_ss(&hh[ldh+1]); - __m256 h_3_2 = _mm256_broadcast_ss(&hh[(ldh*2)+1]); - __m256 h_3_1 = _mm256_broadcast_ss(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - register __m256 z1 = _mm256_FMA_ps(a2_1, h_3_2, a3_1); - z1 = _mm256_FMA_ps(a1_1, h_3_1, z1); - register __m256 y1 = _mm256_FMA_ps(a1_1, h_2_1, a2_1); -#else - register __m256 z1 = _mm256_add_ps(a3_1, _mm256_mul_ps(a2_1, h_3_2)); - z1 = _mm256_add_ps(z1, _mm256_mul_ps(a1_1, h_3_1)); - register __m256 y1 = _mm256_add_ps(a2_1, _mm256_mul_ps(a1_1, h_2_1)); -#endif - register __m256 x1 = a1_1; + __m512 a1_1 = _mm512_load_ps(&q[ldq*5]); + __m512 a2_1 = _mm512_load_ps(&q[ldq*4]); + __m512 a3_1 = _mm512_load_ps(&q[ldq*3]); + __m512 a4_1 = _mm512_load_ps(&q[ldq*2]); + __m512 a5_1 = _mm512_load_ps(&q[ldq]); + __m512 a6_1 = _mm512_load_ps(&q[0]); - __m256 q1; + __m512 h_6_5 = _mm512_set1_ps(hh[(ldh*5)+1]); + __m512 h_6_4 = _mm512_set1_ps(hh[(ldh*5)+2]); + __m512 h_6_3 = _mm512_set1_ps(hh[(ldh*5)+3]); + __m512 h_6_2 = _mm512_set1_ps(hh[(ldh*5)+4]); + __m512 h_6_1 = _mm512_set1_ps(hh[(ldh*5)+5]); - __m256 h1; - __m256 h2; - __m256 h3; - __m256 h4; - __m256 h5; - __m256 h6; +// register __m512d t1 = _mm512_FMA_ps(a5_1, h_6_5, a6_1); + __m512 t1 = _mm512_FMA_ps(a5_1, h_6_5, a6_1); - for(i = 6; i < nb; i++) - { - h1 = _mm256_broadcast_ss(&hh[i-5]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[i*ldq])); // q(1,i) | q(2,i) ... 
| q(4,i) -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_ps(q1, h1, x1); -#else - x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_ps(q1, h2, y1); -#else - y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_ps(q1, h3, z1); -#else - z1 = _mm256_add_ps(z1, _mm256_mul_ps(q1,h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_ps(q1, h4, w1); -#else - w1 = _mm256_add_ps(w1, _mm256_mul_ps(q1,h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMA_ps(q1, h5, v1); -#else - v1 = _mm256_add_ps(v1, _mm256_mul_ps(q1,h5)); -#endif - h6 = _mm256_broadcast_ss(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - t1 = _mm256_FMA_ps(q1, h6, t1); -#else - t1 = _mm256_add_ps(t1, _mm256_mul_ps(q1,h6)); -#endif - } + t1 = _mm512_FMA_ps(a4_1, h_6_4, t1); + t1 = _mm512_FMA_ps(a3_1, h_6_3, t1); + t1 = _mm512_FMA_ps(a2_1, h_6_2, t1); + t1 = _mm512_FMA_ps(a1_1, h_6_1, t1); - h1 = _mm256_broadcast_ss(&hh[nb-5]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[nb*ldq])); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_ps(q1, h1, x1); -#else - x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_ps(q1, h2, y1); -#else - y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_ps(q1, h3, z1); -#else - z1 = _mm256_add_ps(z1, _mm256_mul_ps(q1,h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_ps(q1, h4, w1); -#else - w1 = _mm256_add_ps(w1, _mm256_mul_ps(q1,h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMA_ps(q1, h5, v1); -#else - v1 = _mm256_add_ps(v1, _mm256_mul_ps(q1,h5)); -#endif + __m512 h_5_4 = _mm512_set1_ps(hh[(ldh*4)+1]); + __m512 h_5_3 = _mm512_set1_ps(hh[(ldh*4)+2]); + __m512 h_5_2 = _mm512_set1_ps(hh[(ldh*4)+3]); + __m512 h_5_1 = _mm512_set1_ps(hh[(ldh*4)+4]); - h1 = _mm256_broadcast_ss(&hh[nb-4]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+1)*ldq])); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_ps(q1, h1, x1); -#else - x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_ps(q1, h2, y1); -#else - y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_ps(q1, h3, z1); -#else - z1 = _mm256_add_ps(z1, _mm256_mul_ps(q1,h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_ps(q1, h4, w1); -#else - w1 = _mm256_add_ps(w1, _mm256_mul_ps(q1,h4)); -#endif +// register __m512d v1 = _mm512_FMA_ps(a4_1, h_5_4, a5_1); + __m512 v1 = _mm512_FMA_ps(a4_1, h_5_4, a5_1); - h1 = _mm256_broadcast_ss(&hh[nb-3]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+2)*ldq])); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_ps(q1, h1, x1); -#else - x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_ps(q1, h2, y1); -#else - y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_ps(q1, h3, z1); -#else - z1 = 
_mm256_add_ps(z1, _mm256_mul_ps(q1,h3)); -#endif + v1 = _mm512_FMA_ps(a3_1, h_5_3, v1); + v1 = _mm512_FMA_ps(a2_1, h_5_2, v1); + v1 = _mm512_FMA_ps(a1_1, h_5_1, v1); - h1 = _mm256_broadcast_ss(&hh[nb-2]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+3)*ldq])); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_ps(q1, h1, x1); -#else - x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_ps(q1, h2, y1); -#else - y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2)); -#endif + __m512 h_4_3 = _mm512_set1_ps(hh[(ldh*3)+1]); + __m512 h_4_2 = _mm512_set1_ps(hh[(ldh*3)+2]); + __m512 h_4_1 = _mm512_set1_ps(hh[(ldh*3)+3]); - h1 = _mm256_broadcast_ss(&hh[nb-1]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+4)*ldq])); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_ps(q1, h1, x1); -#else - x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1)); -#endif +// register __m512d w1 = _mm512_FMA_ps(a3_1, h_4_3, a4_1); + __m512 w1 = _mm512_FMA_ps(a3_1, h_4_3, a4_1); - ///////////////////////////////////////////////////// - // Apply tau, correct wrong calculation using pre-calculated scalar products - ///////////////////////////////////////////////////// + w1 = _mm512_FMA_ps(a2_1, h_4_2, w1); + w1 = _mm512_FMA_ps(a1_1, h_4_1, w1); - __m256 tau1 = _mm256_broadcast_ss(&hh[0]); - x1 = _mm256_mul_ps(x1, tau1); + __m512 h_2_1 = _mm512_set1_ps(hh[ldh+1]); + __m512 h_3_2 = _mm512_set1_ps(hh[(ldh*2)+1]); + __m512 h_3_1 = _mm512_set1_ps(hh[(ldh*2)+2]); - __m256 tau2 = _mm256_broadcast_ss(&hh[ldh]); - __m256 vs_1_2 = _mm256_broadcast_ss(&scalarprods[0]); - h2 = _mm256_mul_ps(tau2, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMSUB_ps(y1, tau2, _mm256_mul_ps(x1,h2)); -#else - y1 = _mm256_sub_ps(_mm256_mul_ps(y1,tau2), _mm256_mul_ps(x1,h2)); -#endif +// register __m512d z1 = _mm512_FMA_ps(a2_1, h_3_2, a3_1); + __m512 z1 = _mm512_FMA_ps(a2_1, h_3_2, a3_1); - __m256 tau3 = _mm256_broadcast_ss(&hh[ldh*2]); - __m256 vs_1_3 = _mm256_broadcast_ss(&scalarprods[1]); - __m256 vs_2_3 = _mm256_broadcast_ss(&scalarprods[2]); - h2 = _mm256_mul_ps(tau3, vs_1_3); - h3 = _mm256_mul_ps(tau3, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMSUB_ps(z1, tau3, _mm256_FMA_ps(y1, h3, _mm256_mul_ps(x1,h2))); -#else - z1 = _mm256_sub_ps(_mm256_mul_ps(z1,tau3), _mm256_add_ps(_mm256_mul_ps(y1,h3), _mm256_mul_ps(x1,h2))); -#endif + z1 = _mm512_FMA_ps(a1_1, h_3_1, z1); +// register __m512d y1 = _mm512_FMA_ps(a1_1, h_2_1, a2_1); + __m512 y1 = _mm512_FMA_ps(a1_1, h_2_1, a2_1); - __m256 tau4 = _mm256_broadcast_ss(&hh[ldh*3]); - __m256 vs_1_4 = _mm256_broadcast_ss(&scalarprods[3]); - __m256 vs_2_4 = _mm256_broadcast_ss(&scalarprods[4]); - h2 = _mm256_mul_ps(tau4, vs_1_4); - h3 = _mm256_mul_ps(tau4, vs_2_4); - __m256 vs_3_4 = _mm256_broadcast_ss(&scalarprods[5]); - h4 = _mm256_mul_ps(tau4, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMSUB_ps(w1, tau4, _mm256_FMA_ps(z1, h4, _mm256_FMA_ps(y1, h3, _mm256_mul_ps(x1,h2)))); -#else - w1 = _mm256_sub_ps(_mm256_mul_ps(w1,tau4), _mm256_add_ps(_mm256_mul_ps(z1,h4), _mm256_add_ps(_mm256_mul_ps(y1,h3), _mm256_mul_ps(x1,h2)))); -#endif - __m256 tau5 = _mm256_broadcast_ss(&hh[ldh*4]); - __m256 vs_1_5 = _mm256_broadcast_ss(&scalarprods[6]); - __m256 vs_2_5 = _mm256_broadcast_ss(&scalarprods[7]); - h2 = _mm256_mul_ps(tau5, vs_1_5); - h3 = _mm256_mul_ps(tau5, vs_2_5); - __m256 vs_3_5 = _mm256_broadcast_ss(&scalarprods[8]); - __m256 vs_4_5 = _mm256_broadcast_ss(&scalarprods[9]); - h4 = _mm256_mul_ps(tau5, vs_3_5); - h5 = _mm256_mul_ps(tau5, vs_4_5); 
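
For reference: the AVX2 code removed in this hunk guards every fused multiply-add behind __ELPA_USE_FMA__ and supplies a separate multiply/add fallback, while the AVX-512 replacement uses the FMA forms unconditionally (FMA is part of the AVX-512F base set). A minimal scalar sketch of the two branches follows; it assumes the _mm256_FMSUB_ps and _mm256_NFMA_ps helper macros wrap the corresponding _mm256_fmsub_ps and _mm256_fnmadd_ps intrinsics, whose definitions live elsewhere in ELPA and are not shown in this patch.

/* Illustrative only -- not part of the patch.  Per float lane, the FMA and
   non-FMA branches compute the same value; the fused form simply performs
   the multiply and subtract with a single rounding step. */
static inline float fmsub_lane_fma(float v, float tau, float corr)
{
    return v * tau - corr;             /* as in _mm256_FMSUB_ps(v, tau, corr) */
}

static inline float fmsub_lane_fallback(float v, float tau, float corr)
{
    float prod = v * tau;              /* _mm256_mul_ps(v, tau) */
    return prod - corr;                /* _mm256_sub_ps(prod, corr) */
}
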
-#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMSUB_ps(v1, tau5, _mm256_add_ps(_mm256_FMA_ps(w1, h5, _mm256_mul_ps(z1,h4)), _mm256_FMA_ps(y1, h3, _mm256_mul_ps(x1,h2)))); -#else - v1 = _mm256_sub_ps(_mm256_mul_ps(v1,tau5), _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(w1,h5), _mm256_mul_ps(z1,h4)), _mm256_add_ps(_mm256_mul_ps(y1,h3), _mm256_mul_ps(x1,h2)))); -#endif +// register __m512d x1 = a1_1; + __m512 x1 = a1_1; - __m256 tau6 = _mm256_broadcast_ss(&hh[ldh*5]); - __m256 vs_1_6 = _mm256_broadcast_ss(&scalarprods[10]); - __m256 vs_2_6 = _mm256_broadcast_ss(&scalarprods[11]); - h2 = _mm256_mul_ps(tau6, vs_1_6); - h3 = _mm256_mul_ps(tau6, vs_2_6); - __m256 vs_3_6 = _mm256_broadcast_ss(&scalarprods[12]); - __m256 vs_4_6 = _mm256_broadcast_ss(&scalarprods[13]); - __m256 vs_5_6 = _mm256_broadcast_ss(&scalarprods[14]); - h4 = _mm256_mul_ps(tau6, vs_3_6); - h5 = _mm256_mul_ps(tau6, vs_4_6); - h6 = _mm256_mul_ps(tau6, vs_5_6); -#ifdef __ELPA_USE_FMA__ - t1 = _mm256_FMSUB_ps(t1, tau6, _mm256_FMA_ps(v1, h6, _mm256_add_ps(_mm256_FMA_ps(w1, h5, _mm256_mul_ps(z1,h4)), _mm256_FMA_ps(y1, h3, _mm256_mul_ps(x1,h2))))); -#else - t1 = _mm256_sub_ps(_mm256_mul_ps(t1,tau6), _mm256_add_ps( _mm256_mul_ps(v1,h6), _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(w1,h5), _mm256_mul_ps(z1,h4)), _mm256_add_ps(_mm256_mul_ps(y1,h3), _mm256_mul_ps(x1,h2))))); -#endif + __m512 a1_2 = _mm512_load_ps(&q[(ldq*5)+16]); + __m512 a2_2 = _mm512_load_ps(&q[(ldq*4)+16]); + __m512 a3_2 = _mm512_load_ps(&q[(ldq*3)+16]); + __m512 a4_2 = _mm512_load_ps(&q[(ldq*2)+16]); + __m512 a5_2 = _mm512_load_ps(&q[(ldq)+16]); + __m512 a6_2 = _mm512_load_ps(&q[0+16]); - ///////////////////////////////////////////////////// - // Rank-1 update of Q [4 x nb+3] - ///////////////////////////////////////////////////// +// register __m512d t2 = _mm512_FMA_ps(a5_2, h_6_5, a6_2); + __m512 t2 = _mm512_FMA_ps(a5_2, h_6_5, a6_2); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[0])); - q1 = _mm256_sub_ps(q1, t1); - _mm_store_ps(&q[0], _mm256_castps256_ps128(q1)); - - h6 = _mm256_broadcast_ss(&hh[(ldh*5)+1]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq])); - q1 = _mm256_sub_ps(q1, v1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(t1, h6, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6)); -#endif - _mm_store_ps(&q[ldq], _mm256_castps256_ps128(q1)); - - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+1]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*2])); - q1 = _mm256_sub_ps(q1, w1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(v1, h5, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5)); -#endif - h6 = _mm256_broadcast_ss(&hh[(ldh*5)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(t1, h6, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6)); -#endif - _mm_store_ps(&q[ldq*2], _mm256_castps256_ps128(q1)); - - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+1]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*3])); - q1 = _mm256_sub_ps(q1, z1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(w1, h4, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(w1, h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(v1, h5, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5)); -#endif - h6 = _mm256_broadcast_ss(&hh[(ldh*5)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(t1, h6, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6)); -#endif - _mm_store_ps(&q[ldq*3], _mm256_castps256_ps128(q1)); - - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+1]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*4])); - q1 = 
_mm256_sub_ps(q1, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(z1, h3, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(z1, h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(w1, h4, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(w1, h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(v1, h5, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5)); -#endif - h6 = _mm256_broadcast_ss(&hh[(ldh*5)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(t1, h6, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6)); -#endif - _mm_store_ps(&q[ldq*4], _mm256_castps256_ps128(q1)); - - h2 = _mm256_broadcast_ss(&hh[(ldh)+1]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[ldq*5])); - q1 = _mm256_sub_ps(q1, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(y1, h2, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(y1, h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(z1, h3, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(z1, h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(w1, h4, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(w1, h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(v1, h5, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5)); -#endif - h6 = _mm256_broadcast_ss(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(t1, h6, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6)); -#endif - _mm_store_ps(&q[ldq*5], _mm256_castps256_ps128(q1)); + t2 = _mm512_FMA_ps(a4_2, h_6_4, t2); + t2 = _mm512_FMA_ps(a3_2, h_6_3, t2); + t2 = _mm512_FMA_ps(a2_2, h_6_2, t2); + t2 = _mm512_FMA_ps(a1_2, h_6_1, t2); + +// register __m512d v2 = _mm512_FMA_ps(a4_2, h_5_4, a5_2); + __m512 v2 = _mm512_FMA_ps(a4_2, h_5_4, a5_2); + + v2 = _mm512_FMA_ps(a3_2, h_5_3, v2); + v2 = _mm512_FMA_ps(a2_2, h_5_2, v2); + v2 = _mm512_FMA_ps(a1_2, h_5_1, v2); + +// register __m512d w2 = _mm512_FMA_ps(a3_2, h_4_3, a4_2); + __m512 w2 = _mm512_FMA_ps(a3_2, h_4_3, a4_2); + + w2 = _mm512_FMA_ps(a2_2, h_4_2, w2); + w2 = _mm512_FMA_ps(a1_2, h_4_1, w2); + +// register __m512d z2 = _mm512_FMA_ps(a2_2, h_3_2, a3_2); + __m512 z2 = _mm512_FMA_ps(a2_2, h_3_2, a3_2); + + z2 = _mm512_FMA_ps(a1_2, h_3_1, z2); +// register __m512d y2 = _mm512_FMA_ps(a1_2, h_2_1, a2_2); + __m512 y2 = _mm512_FMA_ps(a1_2, h_2_1, a2_2); + + +// register __m512d x2 = a1_2; + __m512 x2 = a1_2; + + __m512 a1_3 = _mm512_load_ps(&q[(ldq*5)+32]); + __m512 a2_3 = _mm512_load_ps(&q[(ldq*4)+32]); + __m512 a3_3 = _mm512_load_ps(&q[(ldq*3)+32]); + __m512 a4_3 = _mm512_load_ps(&q[(ldq*2)+32]); + __m512 a5_3 = _mm512_load_ps(&q[(ldq)+32]); + __m512 a6_3 = _mm512_load_ps(&q[0+32]); + +// register __m512d t3 = _mm512_FMA_ps(a5_3, h_6_5, a6_3); + __m512 t3 = _mm512_FMA_ps(a5_3, h_6_5, a6_3); + + t3 = _mm512_FMA_ps(a4_3, h_6_4, t3); + t3 = _mm512_FMA_ps(a3_3, h_6_3, t3); + t3 = _mm512_FMA_ps(a2_3, h_6_2, t3); + t3 = _mm512_FMA_ps(a1_3, h_6_1, t3); + +// register __m512d v3 = _mm512_FMA_ps(a4_3, h_5_4, a5_3); + __m512 v3 = _mm512_FMA_ps(a4_3, h_5_4, a5_3); + + v3 = _mm512_FMA_ps(a3_3, h_5_3, v3); + v3 = _mm512_FMA_ps(a2_3, h_5_2, v3); + v3 = _mm512_FMA_ps(a1_3, h_5_1, v3); + +// register __m512d w3 = _mm512_FMA_ps(a3_3, h_4_3, a4_3); + __m512 w3 = _mm512_FMA_ps(a3_3, h_4_3, a4_3); + + w3 = _mm512_FMA_ps(a2_3, h_4_2, w3); + w3 = _mm512_FMA_ps(a1_3, h_4_1, w3); + +// 
register __m512d z3 = _mm512_FMA_ps(a2_3, h_3_2, a3_3); + __m512 z3 = _mm512_FMA_ps(a2_3, h_3_2, a3_3); + + z3 = _mm512_FMA_ps(a1_3, h_3_1, z3); +// register __m512d y3 = _mm512_FMA_ps(a1_3, h_2_1, a2_3); + __m512 y3 = _mm512_FMA_ps(a1_3, h_2_1, a2_3); + + +// register __m512d x3 = a1_3; + __m512 x3 = a1_3; + + +// __m512 a1_4 = _mm512_load_ps(&q[(ldq*5)+48]); +// __m512 a2_4 = _mm512_load_ps(&q[(ldq*4)+48]); +// __m512 a3_4 = _mm512_load_ps(&q[(ldq*3)+48]); +// __m512 a4_4 = _mm512_load_ps(&q[(ldq*2)+48]); +// __m512 a5_4 = _mm512_load_ps(&q[(ldq)+48]); +// __m512 a6_4 = _mm512_load_ps(&q[0+48]); +// +//// register __m512d t4 = _mm512_FMA_ps(a5_4, h_6_5, a6_4); + // __m512 t4 = _mm512_FMA_ps(a5_4, h_6_5, a6_4); +// +// t4 = _mm512_FMA_ps(a4_4, h_6_4, t4); +// t4 = _mm512_FMA_ps(a3_4, h_6_3, t4); +// t4 = _mm512_FMA_ps(a2_4, h_6_2, t4); +// t4 = _mm512_FMA_ps(a1_4, h_6_1, t4); +// +//// register __m512d v4 = _mm512_FMA_ps(a4_4, h_5_4, a5_4); +// __m512 v4 = _mm512_FMA_ps(a4_4, h_5_4, a5_4); +// +// v4 = _mm512_FMA_ps(a3_4, h_5_3, v4); +// v4 = _mm512_FMA_ps(a2_4, h_5_2, v4); +// v4 = _mm512_FMA_ps(a1_4, h_5_1, v4); +// +//// register __m512d w4 = _mm512_FMA_ps(a3_4, h_4_3, a4_4); + // __m512 w4 = _mm512_FMA_ps(a3_4, h_4_3, a4_4); +// +// w4 = _mm512_FMA_ps(a2_4, h_4_2, w4); +// w4 = _mm512_FMA_ps(a1_4, h_4_1, w4); +// +//// register __m512d z4 = _mm512_FMA_ps(a2_4, h_3_2, a3_4); +// __m512 z4 = _mm512_FMA_ps(a2_4, h_3_2, a3_4); +// +// z4 = _mm512_FMA_ps(a1_4, h_3_1, z4); +//// register __m512d y4 = _mm512_FMA_ps(a1_4, h_2_1, a2_4); +// __m512 y4 = _mm512_FMA_ps(a1_4, h_2_1, a2_4); +// +// +//// register __m512d x4 = a1_4; +// __m512 x4 = a1_4; + + + __m512 q1; + __m512 q2; + __m512 q3; +// __m512 q4; + + __m512 h1; + __m512 h2; + __m512 h3; + __m512 h4; + __m512 h5; + __m512 h6; + + for(i = 6; i < nb; i++) + { + h1 = _mm512_set1_ps(hh[i-5]); + q1 = _mm512_load_ps(&q[i*ldq]); + q2 = _mm512_load_ps(&q[(i*ldq)+16]); + q3 = _mm512_load_ps(&q[(i*ldq)+32]); +// q4 = _mm512_load_ps(&q[(i*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); + x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+i-4]); + + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); + y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+i-3]); + + z1 = _mm512_FMA_ps(q1, h3, z1); + z2 = _mm512_FMA_ps(q2, h3, z2); + z3 = _mm512_FMA_ps(q3, h3, z3); +// z4 = _mm512_FMA_ps(q4, h3, z4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+i-2]); + + w1 = _mm512_FMA_ps(q1, h4, w1); + w2 = _mm512_FMA_ps(q2, h4, w2); + w3 = _mm512_FMA_ps(q3, h4, w3); +// w4 = _mm512_FMA_ps(q4, h4, w4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+i-1]); + + v1 = _mm512_FMA_ps(q1, h5, v1); + v2 = _mm512_FMA_ps(q2, h5, v2); + v3 = _mm512_FMA_ps(q3, h5, v3); +// v4 = _mm512_FMA_ps(q4, h5, v4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+i]); + + t1 = _mm512_FMA_ps(q1, h6, t1); + t2 = _mm512_FMA_ps(q2, h6, t2); + t3 = _mm512_FMA_ps(q3, h6, t3); +// t4 = _mm512_FMA_ps(q4, h6, t4); + } + + h1 = _mm512_set1_ps(hh[nb-5]); + q1 = _mm512_load_ps(&q[nb*ldq]); + q2 = _mm512_load_ps(&q[(nb*ldq)+16]); + q3 = _mm512_load_ps(&q[(nb*ldq)+32]); +// q4 = _mm512_load_ps(&q[(nb*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); + x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-4]); + + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); + y3 = _mm512_FMA_ps(q3, h2, y3); +// 
y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-3]); + + z1 = _mm512_FMA_ps(q1, h3, z1); + z2 = _mm512_FMA_ps(q2, h3, z2); + z3 = _mm512_FMA_ps(q3, h3, z3); +// z4 = _mm512_FMA_ps(q4, h3, z4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-2]); + + w1 = _mm512_FMA_ps(q1, h4, w1); + w2 = _mm512_FMA_ps(q2, h4, w2); + w3 = _mm512_FMA_ps(q3, h4, w3); +// w4 = _mm512_FMA_ps(q4, h4, w4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+nb-1]); + + v1 = _mm512_FMA_ps(q1, h5, v1); + v2 = _mm512_FMA_ps(q2, h5, v2); + v3 = _mm512_FMA_ps(q3, h5, v3); +// v4 = _mm512_FMA_ps(q4, h5, v4); + + h1 = _mm512_set1_ps(hh[nb-4]); + + q1 = _mm512_load_ps(&q[(nb+1)*ldq]); + q2 = _mm512_load_ps(&q[((nb+1)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+1)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+1)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); + x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-3]); + + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); + y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-2]); + + z1 = _mm512_FMA_ps(q1, h3, z1); + z2 = _mm512_FMA_ps(q2, h3, z2); + z3 = _mm512_FMA_ps(q3, h3, z3); +// z4 = _mm512_FMA_ps(q4, h3, z4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-1]); + + w1 = _mm512_FMA_ps(q1, h4, w1); + w2 = _mm512_FMA_ps(q2, h4, w2); + w3 = _mm512_FMA_ps(q3, h4, w3); +// w4 = _mm512_FMA_ps(q4, h4, w4); + + h1 = _mm512_set1_ps(hh[nb-3]); + q1 = _mm512_load_ps(&q[(nb+2)*ldq]); + q2 = _mm512_load_ps(&q[((nb+2)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+2)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+2)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); + x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-2]); + + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); + y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-1]); + + z1 = _mm512_FMA_ps(q1, h3, z1); + z2 = _mm512_FMA_ps(q2, h3, z2); + z3 = _mm512_FMA_ps(q3, h3, z3); +// z4 = _mm512_FMA_ps(q4, h3, z4); + + h1 = _mm512_set1_ps(hh[nb-2]); + q1 = _mm512_load_ps(&q[(nb+3)*ldq]); + q2 = _mm512_load_ps(&q[((nb+3)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+3)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+3)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); + x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-1]); + + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); + y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h1 = _mm512_set1_ps(hh[nb-1]); + q1 = _mm512_load_ps(&q[(nb+4)*ldq]); + q2 = _mm512_load_ps(&q[((nb+4)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+4)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+4)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); + x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// + + __m512 tau1 = _mm512_set1_ps(hh[0]); + x1 = _mm512_mul_ps(x1, tau1); + x2 = _mm512_mul_ps(x2, tau1); + x3 = _mm512_mul_ps(x3, tau1); +// x4 = _mm512_mul_ps(x4, tau1); + + __m512 tau2 = _mm512_set1_ps(hh[ldh]); + __m512 vs_1_2 = _mm512_set1_ps(scalarprods[0]); + h2 = _mm512_mul_ps(tau2, vs_1_2); 
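
The tau2 correction applied in the next few statements is, element by element, y = tau2*y - (tau2*vs_1_2)*x across the 48 rows held in the three 16-lane registers (the fourth register stays commented out in this kernel). Below is a hedged scalar sketch of that step; the function and argument names are illustrative only, vs_1_2 stands for the broadcast of scalarprods[0], and _mm512_FMSUB_ps is assumed to wrap _mm512_fmsub_ps.

/* Illustrative only -- not part of the patch. */
static void apply_tau2_correction(float *x, float *y, int nrows,
                                  float tau2, float vs_1_2)
{
    float h2 = tau2 * vs_1_2;               /* h2 = _mm512_mul_ps(tau2, vs_1_2) */
    for (int i = 0; i < nrows; i++)
        y[i] = y[i] * tau2 - x[i] * h2;     /* y = _mm512_FMSUB_ps(y, tau2, x*h2) */
}
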
+ + y1 = _mm512_FMSUB_ps(y1, tau2, _mm512_mul_ps(x1,h2)); + y2 = _mm512_FMSUB_ps(y2, tau2, _mm512_mul_ps(x2,h2)); + y3 = _mm512_FMSUB_ps(y3, tau2, _mm512_mul_ps(x3,h2)); +// y4 = _mm512_FMSUB_ps(y4, tau2, _mm512_mul_ps(x4,h2)); + + __m512 tau3 = _mm512_set1_ps(hh[ldh*2]); + __m512 vs_1_3 = _mm512_set1_ps(scalarprods[1]); + __m512 vs_2_3 = _mm512_set1_ps(scalarprods[2]); + + h2 = _mm512_mul_ps(tau3, vs_1_3); + h3 = _mm512_mul_ps(tau3, vs_2_3); + + z1 = _mm512_FMSUB_ps(z1, tau3, _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2))); + z2 = _mm512_FMSUB_ps(z2, tau3, _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2))); + z3 = _mm512_FMSUB_ps(z3, tau3, _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2))); +// z4 = _mm512_FMSUB_ps(z4, tau3, _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2))); + + __m512 tau4 = _mm512_set1_ps(hh[ldh*3]); + __m512 vs_1_4 = _mm512_set1_ps(scalarprods[3]); + __m512 vs_2_4 = _mm512_set1_ps(scalarprods[4]); + + h2 = _mm512_mul_ps(tau4, vs_1_4); + h3 = _mm512_mul_ps(tau4, vs_2_4); + + __m512 vs_3_4 = _mm512_set1_ps(scalarprods[5]); + h4 = _mm512_mul_ps(tau4, vs_3_4); + + w1 = _mm512_FMSUB_ps(w1, tau4, _mm512_FMA_ps(z1, h4, _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2)))); + w2 = _mm512_FMSUB_ps(w2, tau4, _mm512_FMA_ps(z2, h4, _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2)))); + w3 = _mm512_FMSUB_ps(w3, tau4, _mm512_FMA_ps(z3, h4, _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2)))); +// w4 = _mm512_FMSUB_ps(w4, tau4, _mm512_FMA_ps(z4, h4, _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2)))); + + __m512 tau5 = _mm512_set1_ps(hh[ldh*4]); + __m512 vs_1_5 = _mm512_set1_ps(scalarprods[6]); + __m512 vs_2_5 = _mm512_set1_ps(scalarprods[7]); + + h2 = _mm512_mul_ps(tau5, vs_1_5); + h3 = _mm512_mul_ps(tau5, vs_2_5); + + __m512 vs_3_5 = _mm512_set1_ps(scalarprods[8]); + __m512 vs_4_5 = _mm512_set1_ps(scalarprods[9]); + + h4 = _mm512_mul_ps(tau5, vs_3_5); + h5 = _mm512_mul_ps(tau5, vs_4_5); + + v1 = _mm512_FMSUB_ps(v1, tau5, _mm512_add_ps(_mm512_FMA_ps(w1, h5, _mm512_mul_ps(z1,h4)), _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2)))); + v2 = _mm512_FMSUB_ps(v2, tau5, _mm512_add_ps(_mm512_FMA_ps(w2, h5, _mm512_mul_ps(z2,h4)), _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2)))); + v3 = _mm512_FMSUB_ps(v3, tau5, _mm512_add_ps(_mm512_FMA_ps(w3, h5, _mm512_mul_ps(z3,h4)), _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2)))); +// v4 = _mm512_FMSUB_ps(v4, tau5, _mm512_add_ps(_mm512_FMA_ps(w4, h5, _mm512_mul_ps(z4,h4)), _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2)))); + + __m512 tau6 = _mm512_set1_ps(hh[ldh*5]); + __m512 vs_1_6 = _mm512_set1_ps(scalarprods[10]); + __m512 vs_2_6 = _mm512_set1_ps(scalarprods[11]); + h2 = _mm512_mul_ps(tau6, vs_1_6); + h3 = _mm512_mul_ps(tau6, vs_2_6); + + __m512 vs_3_6 = _mm512_set1_ps(scalarprods[12]); + __m512 vs_4_6 = _mm512_set1_ps(scalarprods[13]); + __m512 vs_5_6 = _mm512_set1_ps(scalarprods[14]); + + h4 = _mm512_mul_ps(tau6, vs_3_6); + h5 = _mm512_mul_ps(tau6, vs_4_6); + h6 = _mm512_mul_ps(tau6, vs_5_6); + + t1 = _mm512_FMSUB_ps(t1, tau6, _mm512_FMA_ps(v1, h6, _mm512_add_ps(_mm512_FMA_ps(w1, h5, _mm512_mul_ps(z1,h4)), _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2))))); + t2 = _mm512_FMSUB_ps(t2, tau6, _mm512_FMA_ps(v2, h6, _mm512_add_ps(_mm512_FMA_ps(w2, h5, _mm512_mul_ps(z2,h4)), _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2))))); + t3 = _mm512_FMSUB_ps(t3, tau6, _mm512_FMA_ps(v3, h6, _mm512_add_ps(_mm512_FMA_ps(w3, h5, _mm512_mul_ps(z3,h4)), _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2))))); +// t4 = _mm512_FMSUB_ps(t4, tau6, _mm512_FMA_ps(v4, h6, _mm512_add_ps(_mm512_FMA_ps(w4, h5, _mm512_mul_ps(z4,h4)), _mm512_FMA_ps(y4, h3, 
_mm512_mul_ps(x4,h2))))); + + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [8 x nb+3] + ///////////////////////////////////////////////////// + + q1 = _mm512_load_ps(&q[0]); + q2 = _mm512_load_ps(&q[0+16]); + q3 = _mm512_load_ps(&q[0+32]); +// q4 = _mm512_load_ps(&q[0+48]); + + q1 = _mm512_sub_ps(q1, t1); + q2 = _mm512_sub_ps(q2, t2); + q3 = _mm512_sub_ps(q3, t3); +// q4 = _mm512_sub_ps(q4, t4); + + _mm512_store_ps(&q[0],q1); + _mm512_store_ps(&q[0+16],q2); + _mm512_store_ps(&q[0+32],q3); +// _mm512_store_ps(&q[0+48],q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+1]); + q1 = _mm512_load_ps(&q[ldq]); + q2 = _mm512_load_ps(&q[ldq+16]); + q3 = _mm512_load_ps(&q[ldq+32]); +// q4 = _mm512_load_ps(&q[ldq+48]); + + q1 = _mm512_sub_ps(q1, v1); + q2 = _mm512_sub_ps(q2, v2); + q3 = _mm512_sub_ps(q3, v3); +// q4 = _mm512_sub_ps(q4, v4); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); + q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq],q1); + _mm512_store_ps(&q[ldq+16],q2); + _mm512_store_ps(&q[ldq+32],q3); +// _mm512_store_ps(&q[ldq+48],q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+1]); + q1 = _mm512_load_ps(&q[ldq*2]); + q2 = _mm512_load_ps(&q[(ldq*2)+16]); + q3 = _mm512_load_ps(&q[(ldq*2)+32]); +// q4 = _mm512_load_ps(&q[(ldq*2)+48]); + + q1 = _mm512_sub_ps(q1, w1); + q2 = _mm512_sub_ps(q2, w2); + q3 = _mm512_sub_ps(q3, w3); +// q4 = _mm512_sub_ps(q4, w4); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); + q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+2]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); + q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*2],q1); + _mm512_store_ps(&q[(ldq*2)+16],q2); + _mm512_store_ps(&q[(ldq*2)+32],q3); +// _mm512_store_ps(&q[(ldq*2)+48],q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+1]); + q1 = _mm512_load_ps(&q[ldq*3]); + q2 = _mm512_load_ps(&q[(ldq*3)+16]); + q3 = _mm512_load_ps(&q[(ldq*3)+32]); +// q4 = _mm512_load_ps(&q[(ldq*3)+48]); + + q1 = _mm512_sub_ps(q1, z1); + q2 = _mm512_sub_ps(q2, z2); + q3 = _mm512_sub_ps(q3, z3); +// q4 = _mm512_sub_ps(q4, z4); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); + q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+2]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); + q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+3]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); + q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*3],q1); + _mm512_store_ps(&q[(ldq*3)+16],q2); + _mm512_store_ps(&q[(ldq*3)+32],q3); +// _mm512_store_ps(&q[(ldq*3)+48],q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+1]); + q1 = _mm512_load_ps(&q[ldq*4]); + q2 = _mm512_load_ps(&q[(ldq*4)+16]); + q3 = _mm512_load_ps(&q[(ldq*4)+32]); +// q4 = _mm512_load_ps(&q[(ldq*4)+48]); + + q1 = _mm512_sub_ps(q1, y1); + q2 = _mm512_sub_ps(q2, y2); + q3 = _mm512_sub_ps(q3, y3); +// q4 = _mm512_sub_ps(q4, y4); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); + q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+2]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); + q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = 
_mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+3]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); + q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+4]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); + q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*4],q1); + _mm512_store_ps(&q[(ldq*4)+16],q2); + _mm512_store_ps(&q[(ldq*4)+32],q3); +// _mm512_store_ps(&q[(ldq*4)+48],q4); + + h2 = _mm512_set1_ps(hh[(ldh)+1]); + q1 = _mm512_load_ps(&q[ldq*5]); + q2 = _mm512_load_ps(&q[(ldq*5)+16]); + q3 = _mm512_load_ps(&q[(ldq*5)+32]); +// q4 = _mm512_load_ps(&q[(ldq*5)+48]); + + q1 = _mm512_sub_ps(q1, x1); + q2 = _mm512_sub_ps(q2, x2); + q3 = _mm512_sub_ps(q3, x3); +// q4 = _mm512_sub_ps(q4, x4); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); + q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+2]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); + q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+3]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); + q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+4]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); + q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+5]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); + q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*5],q1); + _mm512_store_ps(&q[(ldq*5)+16],q2); + _mm512_store_ps(&q[(ldq*5)+32],q3); +// _mm512_store_ps(&q[(ldq*5)+48],q4); for (i = 6; i < nb; i++) { - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[i*ldq])); - h1 = _mm256_broadcast_ss(&hh[i-5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(x1, h1, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(x1, h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(y1, h2, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(y1, h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(z1, h3, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(z1, h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(w1, h4, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(w1, h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(v1, h5, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5)); -#endif - h6 = _mm256_broadcast_ss(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(t1, h6, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(t1, h6)); -#endif - _mm_store_ps(&q[i*ldq], _mm256_castps256_ps128(q1)); + q1 = _mm512_load_ps(&q[i*ldq]); + q2 = _mm512_load_ps(&q[(i*ldq)+16]); + q3 = _mm512_load_ps(&q[(i*ldq)+32]); +// q4 = _mm512_load_ps(&q[(i*ldq)+48]); + + h1 = _mm512_set1_ps(hh[i-5]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); + q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+i-4]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); + q3 = _mm512_NFMA_ps(y3, h2, q3); + // q4 = _mm512_NFMA_ps(y4, h2, q4); + + 
h3 = _mm512_set1_ps(hh[(ldh*2)+i-3]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); + q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+i-2]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); + q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+i-1]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); + q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+i]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); + q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[i*ldq],q1); + _mm512_store_ps(&q[(i*ldq)+16],q2); + _mm512_store_ps(&q[(i*ldq)+32],q3); +// _mm512_store_ps(&q[(i*ldq)+48],q4); + } - h1 = _mm256_broadcast_ss(&hh[nb-5]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[nb*ldq])); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(x1, h1, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(x1, h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(y1, h2, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(y1, h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(z1, h3, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(z1, h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(w1, h4, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(w1, h4)); -#endif - h5 = _mm256_broadcast_ss(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(v1, h5, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(v1, h5)); -#endif - _mm_store_ps(&q[nb*ldq], _mm256_castps256_ps128(q1)); - - h1 = _mm256_broadcast_ss(&hh[nb-4]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+1)*ldq])); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(x1, h1, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(x1, h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(y1, h2, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(y1, h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(z1, h3, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(z1, h3)); -#endif - h4 = _mm256_broadcast_ss(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(w1, h4, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(w1, h4)); -#endif - _mm_store_ps(&q[(nb+1)*ldq], _mm256_castps256_ps128(q1)); - - h1 = _mm256_broadcast_ss(&hh[nb-3]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+2)*ldq])); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(x1, h1, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(x1, h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(y1, h2, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(y1, h2)); -#endif - h3 = _mm256_broadcast_ss(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(z1, h3, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(z1, h3)); -#endif - _mm_store_ps(&q[(nb+2)*ldq], _mm256_castps256_ps128(q1)); - - h1 = _mm256_broadcast_ss(&hh[nb-2]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+3)*ldq])); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(x1, h1, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(x1, h1)); -#endif - h2 = _mm256_broadcast_ss(&hh[ldh+nb-1]); -#ifdef 
__ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(y1, h2, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(y1, h2)); -#endif - _mm_store_ps(&q[(nb+3)*ldq], _mm256_castps256_ps128(q1)); - - h1 = _mm256_broadcast_ss(&hh[nb-1]); - q1 = _mm256_castps128_ps256(_mm_load_ps(&q[(nb+4)*ldq])); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_ps(x1, h1, q1); -#else - q1 = _mm256_sub_ps(q1, _mm256_mul_ps(x1, h1)); -#endif - _mm_store_ps(&q[(nb+4)*ldq], _mm256_castps256_ps128(q1)); + h1 = _mm512_set1_ps(hh[nb-5]); + q1 = _mm512_load_ps(&q[nb*ldq]); + q2 = _mm512_load_ps(&q[(nb*ldq)+16]); + q3 = _mm512_load_ps(&q[(nb*ldq)+32]); +// q4 = _mm512_load_ps(&q[(nb*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); + q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-4]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); + q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-3]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); + q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-2]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); + q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+nb-1]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); + q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + _mm512_store_ps(&q[nb*ldq],q1); + _mm512_store_ps(&q[(nb*ldq)+16],q2); + _mm512_store_ps(&q[(nb*ldq)+32],q3); +// _mm512_store_ps(&q[(nb*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-4]); + q1 = _mm512_load_ps(&q[(nb+1)*ldq]); + q2 = _mm512_load_ps(&q[((nb+1)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+1)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+1)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); + q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-3]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); + q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-2]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); + q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-1]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); + q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + _mm512_store_ps(&q[(nb+1)*ldq],q1); + _mm512_store_ps(&q[((nb+1)*ldq)+16],q2); + _mm512_store_ps(&q[((nb+1)*ldq)+32],q3); +// _mm512_store_ps(&q[((nb+1)*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-3]); + q1 = _mm512_load_ps(&q[(nb+2)*ldq]); + q2 = _mm512_load_ps(&q[((nb+2)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+2)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+2)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); + q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-2]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); + q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-1]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); + q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + _mm512_store_ps(&q[(nb+2)*ldq],q1); + 
_mm512_store_ps(&q[((nb+2)*ldq)+16],q2); + _mm512_store_ps(&q[((nb+2)*ldq)+32],q3); +// _mm512_store_ps(&q[((nb+2)*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-2]); + q1 = _mm512_load_ps(&q[(nb+3)*ldq]); + q2 = _mm512_load_ps(&q[((nb+3)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+3)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+3)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); + q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-1]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); + q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + _mm512_store_ps(&q[(nb+3)*ldq],q1); + _mm512_store_ps(&q[((nb+3)*ldq)+16],q2); + _mm512_store_ps(&q[((nb+3)*ldq)+32],q3); +// _mm512_store_ps(&q[((nb+3)*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-1]); + q1 = _mm512_load_ps(&q[(nb+4)*ldq]); + q2 = _mm512_load_ps(&q[((nb+4)*ldq)+16]); + q3 = _mm512_load_ps(&q[((nb+4)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+4)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); + q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + _mm512_store_ps(&q[(nb+4)*ldq],q1); + _mm512_store_ps(&q[((nb+4)*ldq)+16],q2); + _mm512_store_ps(&q[((nb+4)*ldq)+32],q3); +// _mm512_store_ps(&q[((nb+4)*ldq)+48],q4); + +} + + +/** + * Unrolled kernel that computes + * 32 rows of Q simultaneously, a + * matrix vector product with two householder + * vectors + a rank 1 update is performed + */ +__forceinline void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [8 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// + int i; + + __m512 a1_1 = _mm512_load_ps(&q[ldq*5]); + __m512 a2_1 = _mm512_load_ps(&q[ldq*4]); + __m512 a3_1 = _mm512_load_ps(&q[ldq*3]); + __m512 a4_1 = _mm512_load_ps(&q[ldq*2]); + __m512 a5_1 = _mm512_load_ps(&q[ldq]); + __m512 a6_1 = _mm512_load_ps(&q[0]); + + __m512 h_6_5 = _mm512_set1_ps(hh[(ldh*5)+1]); + __m512 h_6_4 = _mm512_set1_ps(hh[(ldh*5)+2]); + __m512 h_6_3 = _mm512_set1_ps(hh[(ldh*5)+3]); + __m512 h_6_2 = _mm512_set1_ps(hh[(ldh*5)+4]); + __m512 h_6_1 = _mm512_set1_ps(hh[(ldh*5)+5]); + +// register __m512d t1 = _mm512_FMA_ps(a5_1, h_6_5, a6_1); + __m512 t1 = _mm512_FMA_ps(a5_1, h_6_5, a6_1); + + t1 = _mm512_FMA_ps(a4_1, h_6_4, t1); + t1 = _mm512_FMA_ps(a3_1, h_6_3, t1); + t1 = _mm512_FMA_ps(a2_1, h_6_2, t1); + t1 = _mm512_FMA_ps(a1_1, h_6_1, t1); + + __m512 h_5_4 = _mm512_set1_ps(hh[(ldh*4)+1]); + __m512 h_5_3 = _mm512_set1_ps(hh[(ldh*4)+2]); + __m512 h_5_2 = _mm512_set1_ps(hh[(ldh*4)+3]); + __m512 h_5_1 = _mm512_set1_ps(hh[(ldh*4)+4]); + +// register __m512d v1 = _mm512_FMA_ps(a4_1, h_5_4, a5_1); + __m512 v1 = _mm512_FMA_ps(a4_1, h_5_4, a5_1); + + v1 = _mm512_FMA_ps(a3_1, h_5_3, v1); + v1 = _mm512_FMA_ps(a2_1, h_5_2, v1); + v1 = _mm512_FMA_ps(a1_1, h_5_1, v1); + + __m512 h_4_3 = _mm512_set1_ps(hh[(ldh*3)+1]); + __m512 h_4_2 = _mm512_set1_ps(hh[(ldh*3)+2]); + __m512 h_4_1 = _mm512_set1_ps(hh[(ldh*3)+3]); + +// register __m512d w1 = _mm512_FMA_ps(a3_1, h_4_3, a4_1); + __m512 w1 = _mm512_FMA_ps(a3_1, h_4_3, a4_1); + + w1 = _mm512_FMA_ps(a2_1, h_4_2, w1); + w1 = _mm512_FMA_ps(a1_1, h_4_1, w1); + + __m512 h_2_1 = _mm512_set1_ps(hh[ldh+1]); + __m512 h_3_2 = _mm512_set1_ps(hh[(ldh*2)+1]); + __m512 h_3_1 = _mm512_set1_ps(hh[(ldh*2)+2]); + +// 
register __m512d z1 = _mm512_FMA_ps(a2_1, h_3_2, a3_1); + __m512 z1 = _mm512_FMA_ps(a2_1, h_3_2, a3_1); + + z1 = _mm512_FMA_ps(a1_1, h_3_1, z1); +// register __m512d y1 = _mm512_FMA_ps(a1_1, h_2_1, a2_1); + __m512 y1 = _mm512_FMA_ps(a1_1, h_2_1, a2_1); + + +// register __m512d x1 = a1_1; + __m512 x1 = a1_1; + + __m512 a1_2 = _mm512_load_ps(&q[(ldq*5)+16]); + __m512 a2_2 = _mm512_load_ps(&q[(ldq*4)+16]); + __m512 a3_2 = _mm512_load_ps(&q[(ldq*3)+16]); + __m512 a4_2 = _mm512_load_ps(&q[(ldq*2)+16]); + __m512 a5_2 = _mm512_load_ps(&q[(ldq)+16]); + __m512 a6_2 = _mm512_load_ps(&q[0+16]); + +// register __m512d t2 = _mm512_FMA_ps(a5_2, h_6_5, a6_2); + __m512 t2 = _mm512_FMA_ps(a5_2, h_6_5, a6_2); + + t2 = _mm512_FMA_ps(a4_2, h_6_4, t2); + t2 = _mm512_FMA_ps(a3_2, h_6_3, t2); + t2 = _mm512_FMA_ps(a2_2, h_6_2, t2); + t2 = _mm512_FMA_ps(a1_2, h_6_1, t2); + +// register __m512d v2 = _mm512_FMA_ps(a4_2, h_5_4, a5_2); + __m512 v2 = _mm512_FMA_ps(a4_2, h_5_4, a5_2); + + v2 = _mm512_FMA_ps(a3_2, h_5_3, v2); + v2 = _mm512_FMA_ps(a2_2, h_5_2, v2); + v2 = _mm512_FMA_ps(a1_2, h_5_1, v2); + +// register __m512d w2 = _mm512_FMA_ps(a3_2, h_4_3, a4_2); + __m512 w2 = _mm512_FMA_ps(a3_2, h_4_3, a4_2); + + w2 = _mm512_FMA_ps(a2_2, h_4_2, w2); + w2 = _mm512_FMA_ps(a1_2, h_4_1, w2); + +// register __m512d z2 = _mm512_FMA_ps(a2_2, h_3_2, a3_2); + __m512 z2 = _mm512_FMA_ps(a2_2, h_3_2, a3_2); + + z2 = _mm512_FMA_ps(a1_2, h_3_1, z2); +// register __m512d y2 = _mm512_FMA_ps(a1_2, h_2_1, a2_2); + __m512 y2 = _mm512_FMA_ps(a1_2, h_2_1, a2_2); + + +// register __m512d x2 = a1_2; + __m512 x2 = a1_2; + +// __m512 a1_3 = _mm512_load_ps(&q[(ldq*5)+32]); +// __m512 a2_3 = _mm512_load_ps(&q[(ldq*4)+32]); +// __m512 a3_3 = _mm512_load_ps(&q[(ldq*3)+32]); +// __m512 a4_3 = _mm512_load_ps(&q[(ldq*2)+32]); +// __m512 a5_3 = _mm512_load_ps(&q[(ldq)+32]); +// __m512 a6_3 = _mm512_load_ps(&q[0+32]); +// +//// register __m512d t3 = _mm512_FMA_ps(a5_3, h_6_5, a6_3); +// __m512 t3 = _mm512_FMA_ps(a5_3, h_6_5, a6_3); +// +// t3 = _mm512_FMA_ps(a4_3, h_6_4, t3); +// t3 = _mm512_FMA_ps(a3_3, h_6_3, t3); +// t3 = _mm512_FMA_ps(a2_3, h_6_2, t3); +// t3 = _mm512_FMA_ps(a1_3, h_6_1, t3); +// +//// register __m512d v3 = _mm512_FMA_ps(a4_3, h_5_4, a5_3); +// __m512 v3 = _mm512_FMA_ps(a4_3, h_5_4, a5_3); +// +// v3 = _mm512_FMA_ps(a3_3, h_5_3, v3); +// v3 = _mm512_FMA_ps(a2_3, h_5_2, v3); +// v3 = _mm512_FMA_ps(a1_3, h_5_1, v3); +// +//// register __m512d w3 = _mm512_FMA_ps(a3_3, h_4_3, a4_3); +// __m512 w3 = _mm512_FMA_ps(a3_3, h_4_3, a4_3); +// +// w3 = _mm512_FMA_ps(a2_3, h_4_2, w3); +// w3 = _mm512_FMA_ps(a1_3, h_4_1, w3); +// +//// register __m512d z3 = _mm512_FMA_ps(a2_3, h_3_2, a3_3); +// __m512 z3 = _mm512_FMA_ps(a2_3, h_3_2, a3_3); +// +// z3 = _mm512_FMA_ps(a1_3, h_3_1, z3); +//// register __m512d y3 = _mm512_FMA_ps(a1_3, h_2_1, a2_3); +// __m512 y3 = _mm512_FMA_ps(a1_3, h_2_1, a2_3); +// +// +//// register __m512d x3 = a1_3; +// __m512 x3 = a1_3; + + +// __m512 a1_4 = _mm512_load_ps(&q[(ldq*5)+48]); +// __m512 a2_4 = _mm512_load_ps(&q[(ldq*4)+48]); +// __m512 a3_4 = _mm512_load_ps(&q[(ldq*3)+48]); +// __m512 a4_4 = _mm512_load_ps(&q[(ldq*2)+48]); +// __m512 a5_4 = _mm512_load_ps(&q[(ldq)+48]); +// __m512 a6_4 = _mm512_load_ps(&q[0+48]); +// +//// register __m512d t4 = _mm512_FMA_ps(a5_4, h_6_5, a6_4); + // __m512 t4 = _mm512_FMA_ps(a5_4, h_6_5, a6_4); +// +// t4 = _mm512_FMA_ps(a4_4, h_6_4, t4); +// t4 = _mm512_FMA_ps(a3_4, h_6_3, t4); +// t4 = _mm512_FMA_ps(a2_4, h_6_2, t4); +// t4 = _mm512_FMA_ps(a1_4, h_6_1, t4); +// +//// register 
__m512d v4 = _mm512_FMA_ps(a4_4, h_5_4, a5_4); +// __m512 v4 = _mm512_FMA_ps(a4_4, h_5_4, a5_4); +// +// v4 = _mm512_FMA_ps(a3_4, h_5_3, v4); +// v4 = _mm512_FMA_ps(a2_4, h_5_2, v4); +// v4 = _mm512_FMA_ps(a1_4, h_5_1, v4); +// +//// register __m512d w4 = _mm512_FMA_ps(a3_4, h_4_3, a4_4); + // __m512 w4 = _mm512_FMA_ps(a3_4, h_4_3, a4_4); +// +// w4 = _mm512_FMA_ps(a2_4, h_4_2, w4); +// w4 = _mm512_FMA_ps(a1_4, h_4_1, w4); +// +//// register __m512d z4 = _mm512_FMA_ps(a2_4, h_3_2, a3_4); +// __m512 z4 = _mm512_FMA_ps(a2_4, h_3_2, a3_4); +// +// z4 = _mm512_FMA_ps(a1_4, h_3_1, z4); +//// register __m512d y4 = _mm512_FMA_ps(a1_4, h_2_1, a2_4); +// __m512 y4 = _mm512_FMA_ps(a1_4, h_2_1, a2_4); +// +// +//// register __m512d x4 = a1_4; +// __m512 x4 = a1_4; + + + __m512 q1; + __m512 q2; +// __m512 q3; +// __m512 q4; + + __m512 h1; + __m512 h2; + __m512 h3; + __m512 h4; + __m512 h5; + __m512 h6; + + for(i = 6; i < nb; i++) + { + h1 = _mm512_set1_ps(hh[i-5]); + q1 = _mm512_load_ps(&q[i*ldq]); + q2 = _mm512_load_ps(&q[(i*ldq)+16]); +// q3 = _mm512_load_ps(&q[(i*ldq)+32]); +// q4 = _mm512_load_ps(&q[(i*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); +// x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+i-4]); + + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); +// y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+i-3]); + + z1 = _mm512_FMA_ps(q1, h3, z1); + z2 = _mm512_FMA_ps(q2, h3, z2); +// z3 = _mm512_FMA_ps(q3, h3, z3); +// z4 = _mm512_FMA_ps(q4, h3, z4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+i-2]); + + w1 = _mm512_FMA_ps(q1, h4, w1); + w2 = _mm512_FMA_ps(q2, h4, w2); +// w3 = _mm512_FMA_ps(q3, h4, w3); +// w4 = _mm512_FMA_ps(q4, h4, w4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+i-1]); + + v1 = _mm512_FMA_ps(q1, h5, v1); + v2 = _mm512_FMA_ps(q2, h5, v2); +// v3 = _mm512_FMA_ps(q3, h5, v3); +// v4 = _mm512_FMA_ps(q4, h5, v4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+i]); + + t1 = _mm512_FMA_ps(q1, h6, t1); + t2 = _mm512_FMA_ps(q2, h6, t2); +// t3 = _mm512_FMA_ps(q3, h6, t3); +// t4 = _mm512_FMA_ps(q4, h6, t4); + } + + h1 = _mm512_set1_ps(hh[nb-5]); + q1 = _mm512_load_ps(&q[nb*ldq]); + q2 = _mm512_load_ps(&q[(nb*ldq)+16]); +// q3 = _mm512_load_ps(&q[(nb*ldq)+32]); +// q4 = _mm512_load_ps(&q[(nb*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); +// x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-4]); + + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); +// y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-3]); + + z1 = _mm512_FMA_ps(q1, h3, z1); + z2 = _mm512_FMA_ps(q2, h3, z2); +// z3 = _mm512_FMA_ps(q3, h3, z3); +// z4 = _mm512_FMA_ps(q4, h3, z4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-2]); + + w1 = _mm512_FMA_ps(q1, h4, w1); + w2 = _mm512_FMA_ps(q2, h4, w2); +// w3 = _mm512_FMA_ps(q3, h4, w3); +// w4 = _mm512_FMA_ps(q4, h4, w4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+nb-1]); + + v1 = _mm512_FMA_ps(q1, h5, v1); + v2 = _mm512_FMA_ps(q2, h5, v2); +// v3 = _mm512_FMA_ps(q3, h5, v3); +// v4 = _mm512_FMA_ps(q4, h5, v4); + + h1 = _mm512_set1_ps(hh[nb-4]); + + q1 = _mm512_load_ps(&q[(nb+1)*ldq]); + q2 = _mm512_load_ps(&q[((nb+1)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+1)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+1)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); +// x3 = 
_mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-3]); + + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); +// y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-2]); + + z1 = _mm512_FMA_ps(q1, h3, z1); + z2 = _mm512_FMA_ps(q2, h3, z2); +// z3 = _mm512_FMA_ps(q3, h3, z3); +// z4 = _mm512_FMA_ps(q4, h3, z4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-1]); + + w1 = _mm512_FMA_ps(q1, h4, w1); + w2 = _mm512_FMA_ps(q2, h4, w2); +// w3 = _mm512_FMA_ps(q3, h4, w3); +// w4 = _mm512_FMA_ps(q4, h4, w4); + + h1 = _mm512_set1_ps(hh[nb-3]); + q1 = _mm512_load_ps(&q[(nb+2)*ldq]); + q2 = _mm512_load_ps(&q[((nb+2)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+2)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+2)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); +// x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-2]); + + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); +// y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-1]); + + z1 = _mm512_FMA_ps(q1, h3, z1); + z2 = _mm512_FMA_ps(q2, h3, z2); +// z3 = _mm512_FMA_ps(q3, h3, z3); +// z4 = _mm512_FMA_ps(q4, h3, z4); + + h1 = _mm512_set1_ps(hh[nb-2]); + q1 = _mm512_load_ps(&q[(nb+3)*ldq]); + q2 = _mm512_load_ps(&q[((nb+3)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+3)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+3)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); +// x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-1]); + + y1 = _mm512_FMA_ps(q1, h2, y1); + y2 = _mm512_FMA_ps(q2, h2, y2); +// y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h1 = _mm512_set1_ps(hh[nb-1]); + q1 = _mm512_load_ps(&q[(nb+4)*ldq]); + q2 = _mm512_load_ps(&q[((nb+4)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+4)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+4)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); + x2 = _mm512_FMA_ps(q2, h1, x2); +// x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// + + __m512 tau1 = _mm512_set1_ps(hh[0]); + x1 = _mm512_mul_ps(x1, tau1); + x2 = _mm512_mul_ps(x2, tau1); +// x3 = _mm512_mul_ps(x3, tau1); +// x4 = _mm512_mul_ps(x4, tau1); + + __m512 tau2 = _mm512_set1_ps(hh[ldh]); + __m512 vs_1_2 = _mm512_set1_ps(scalarprods[0]); + h2 = _mm512_mul_ps(tau2, vs_1_2); + + y1 = _mm512_FMSUB_ps(y1, tau2, _mm512_mul_ps(x1,h2)); + y2 = _mm512_FMSUB_ps(y2, tau2, _mm512_mul_ps(x2,h2)); +// y3 = _mm512_FMSUB_ps(y3, tau2, _mm512_mul_ps(x3,h2)); +// y4 = _mm512_FMSUB_ps(y4, tau2, _mm512_mul_ps(x4,h2)); + + __m512 tau3 = _mm512_set1_ps(hh[ldh*2]); + __m512 vs_1_3 = _mm512_set1_ps(scalarprods[1]); + __m512 vs_2_3 = _mm512_set1_ps(scalarprods[2]); + + h2 = _mm512_mul_ps(tau3, vs_1_3); + h3 = _mm512_mul_ps(tau3, vs_2_3); + + z1 = _mm512_FMSUB_ps(z1, tau3, _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2))); + z2 = _mm512_FMSUB_ps(z2, tau3, _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2))); +// z3 = _mm512_FMSUB_ps(z3, tau3, _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2))); +// z4 = _mm512_FMSUB_ps(z4, tau3, _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2))); + + __m512 tau4 = _mm512_set1_ps(hh[ldh*3]); + __m512 vs_1_4 = 
_mm512_set1_ps(scalarprods[3]); + __m512 vs_2_4 = _mm512_set1_ps(scalarprods[4]); + + h2 = _mm512_mul_ps(tau4, vs_1_4); + h3 = _mm512_mul_ps(tau4, vs_2_4); + + __m512 vs_3_4 = _mm512_set1_ps(scalarprods[5]); + h4 = _mm512_mul_ps(tau4, vs_3_4); + + w1 = _mm512_FMSUB_ps(w1, tau4, _mm512_FMA_ps(z1, h4, _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2)))); + w2 = _mm512_FMSUB_ps(w2, tau4, _mm512_FMA_ps(z2, h4, _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2)))); +// w3 = _mm512_FMSUB_ps(w3, tau4, _mm512_FMA_ps(z3, h4, _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2)))); +// w4 = _mm512_FMSUB_ps(w4, tau4, _mm512_FMA_ps(z4, h4, _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2)))); + + __m512 tau5 = _mm512_set1_ps(hh[ldh*4]); + __m512 vs_1_5 = _mm512_set1_ps(scalarprods[6]); + __m512 vs_2_5 = _mm512_set1_ps(scalarprods[7]); + + h2 = _mm512_mul_ps(tau5, vs_1_5); + h3 = _mm512_mul_ps(tau5, vs_2_5); + + __m512 vs_3_5 = _mm512_set1_ps(scalarprods[8]); + __m512 vs_4_5 = _mm512_set1_ps(scalarprods[9]); + + h4 = _mm512_mul_ps(tau5, vs_3_5); + h5 = _mm512_mul_ps(tau5, vs_4_5); + + v1 = _mm512_FMSUB_ps(v1, tau5, _mm512_add_ps(_mm512_FMA_ps(w1, h5, _mm512_mul_ps(z1,h4)), _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2)))); + v2 = _mm512_FMSUB_ps(v2, tau5, _mm512_add_ps(_mm512_FMA_ps(w2, h5, _mm512_mul_ps(z2,h4)), _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2)))); +// v3 = _mm512_FMSUB_ps(v3, tau5, _mm512_add_ps(_mm512_FMA_ps(w3, h5, _mm512_mul_ps(z3,h4)), _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2)))); +// v4 = _mm512_FMSUB_ps(v4, tau5, _mm512_add_ps(_mm512_FMA_ps(w4, h5, _mm512_mul_ps(z4,h4)), _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2)))); + + __m512 tau6 = _mm512_set1_ps(hh[ldh*5]); + __m512 vs_1_6 = _mm512_set1_ps(scalarprods[10]); + __m512 vs_2_6 = _mm512_set1_ps(scalarprods[11]); + h2 = _mm512_mul_ps(tau6, vs_1_6); + h3 = _mm512_mul_ps(tau6, vs_2_6); + + __m512 vs_3_6 = _mm512_set1_ps(scalarprods[12]); + __m512 vs_4_6 = _mm512_set1_ps(scalarprods[13]); + __m512 vs_5_6 = _mm512_set1_ps(scalarprods[14]); + + h4 = _mm512_mul_ps(tau6, vs_3_6); + h5 = _mm512_mul_ps(tau6, vs_4_6); + h6 = _mm512_mul_ps(tau6, vs_5_6); + + t1 = _mm512_FMSUB_ps(t1, tau6, _mm512_FMA_ps(v1, h6, _mm512_add_ps(_mm512_FMA_ps(w1, h5, _mm512_mul_ps(z1,h4)), _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2))))); + t2 = _mm512_FMSUB_ps(t2, tau6, _mm512_FMA_ps(v2, h6, _mm512_add_ps(_mm512_FMA_ps(w2, h5, _mm512_mul_ps(z2,h4)), _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2))))); +// t3 = _mm512_FMSUB_ps(t3, tau6, _mm512_FMA_ps(v3, h6, _mm512_add_ps(_mm512_FMA_ps(w3, h5, _mm512_mul_ps(z3,h4)), _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2))))); +// t4 = _mm512_FMSUB_ps(t4, tau6, _mm512_FMA_ps(v4, h6, _mm512_add_ps(_mm512_FMA_ps(w4, h5, _mm512_mul_ps(z4,h4)), _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2))))); + + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [8 x nb+3] + ///////////////////////////////////////////////////// + + q1 = _mm512_load_ps(&q[0]); + q2 = _mm512_load_ps(&q[0+16]); +// q3 = _mm512_load_ps(&q[0+32]); +// q4 = _mm512_load_ps(&q[0+48]); + + q1 = _mm512_sub_ps(q1, t1); + q2 = _mm512_sub_ps(q2, t2); +// q3 = _mm512_sub_ps(q3, t3); +// q4 = _mm512_sub_ps(q4, t4); + + _mm512_store_ps(&q[0],q1); + _mm512_store_ps(&q[0+16],q2); +// _mm512_store_ps(&q[0+32],q3); +// _mm512_store_ps(&q[0+48],q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+1]); + q1 = _mm512_load_ps(&q[ldq]); + q2 = _mm512_load_ps(&q[ldq+16]); +// q3 = _mm512_load_ps(&q[ldq+32]); +// q4 = _mm512_load_ps(&q[ldq+48]); + + q1 = _mm512_sub_ps(q1, v1); + q2 = _mm512_sub_ps(q2, v2); +// q3 = 
_mm512_sub_ps(q3, v3); +// q4 = _mm512_sub_ps(q4, v4); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); +// q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq],q1); + _mm512_store_ps(&q[ldq+16],q2); +// _mm512_store_ps(&q[ldq+32],q3); +// _mm512_store_ps(&q[ldq+48],q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+1]); + q1 = _mm512_load_ps(&q[ldq*2]); + q2 = _mm512_load_ps(&q[(ldq*2)+16]); +// q3 = _mm512_load_ps(&q[(ldq*2)+32]); +// q4 = _mm512_load_ps(&q[(ldq*2)+48]); + + q1 = _mm512_sub_ps(q1, w1); + q2 = _mm512_sub_ps(q2, w2); +// q3 = _mm512_sub_ps(q3, w3); +// q4 = _mm512_sub_ps(q4, w4); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); +// q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+2]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); +// q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*2],q1); + _mm512_store_ps(&q[(ldq*2)+16],q2); +// _mm512_store_ps(&q[(ldq*2)+32],q3); +// _mm512_store_ps(&q[(ldq*2)+48],q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+1]); + q1 = _mm512_load_ps(&q[ldq*3]); + q2 = _mm512_load_ps(&q[(ldq*3)+16]); +// q3 = _mm512_load_ps(&q[(ldq*3)+32]); +// q4 = _mm512_load_ps(&q[(ldq*3)+48]); + + q1 = _mm512_sub_ps(q1, z1); + q2 = _mm512_sub_ps(q2, z2); +// q3 = _mm512_sub_ps(q3, z3); +// q4 = _mm512_sub_ps(q4, z4); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); +// q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+2]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); +// q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+3]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); +// q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*3],q1); + _mm512_store_ps(&q[(ldq*3)+16],q2); +// _mm512_store_ps(&q[(ldq*3)+32],q3); +// _mm512_store_ps(&q[(ldq*3)+48],q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+1]); + q1 = _mm512_load_ps(&q[ldq*4]); + q2 = _mm512_load_ps(&q[(ldq*4)+16]); +// q3 = _mm512_load_ps(&q[(ldq*4)+32]); +// q4 = _mm512_load_ps(&q[(ldq*4)+48]); + + q1 = _mm512_sub_ps(q1, y1); + q2 = _mm512_sub_ps(q2, y2); +// q3 = _mm512_sub_ps(q3, y3); +// q4 = _mm512_sub_ps(q4, y4); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); +// q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+2]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); +// q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+3]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); +// q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+4]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); +// q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*4],q1); + _mm512_store_ps(&q[(ldq*4)+16],q2); +// _mm512_store_ps(&q[(ldq*4)+32],q3); +// _mm512_store_ps(&q[(ldq*4)+48],q4); + + h2 = _mm512_set1_ps(hh[(ldh)+1]); + q1 = _mm512_load_ps(&q[ldq*5]); + q2 = _mm512_load_ps(&q[(ldq*5)+16]); +// q3 = _mm512_load_ps(&q[(ldq*5)+32]); +// q4 = _mm512_load_ps(&q[(ldq*5)+48]); + + q1 = _mm512_sub_ps(q1, x1); + q2 = 
_mm512_sub_ps(q2, x2); +// q3 = _mm512_sub_ps(q3, x3); +// q4 = _mm512_sub_ps(q4, x4); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); +// q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+2]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); +// q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+3]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); +// q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+4]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); +// q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+5]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); +// q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*5],q1); + _mm512_store_ps(&q[(ldq*5)+16],q2); +// _mm512_store_ps(&q[(ldq*5)+32],q3); +// _mm512_store_ps(&q[(ldq*5)+48],q4); + + for (i = 6; i < nb; i++) + { + q1 = _mm512_load_ps(&q[i*ldq]); + q2 = _mm512_load_ps(&q[(i*ldq)+16]); +// q3 = _mm512_load_ps(&q[(i*ldq)+32]); +// q4 = _mm512_load_ps(&q[(i*ldq)+48]); + + h1 = _mm512_set1_ps(hh[i-5]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); +// q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+i-4]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); +// q3 = _mm512_NFMA_ps(y3, h2, q3); + // q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+i-3]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); +// q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+i-2]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); +// q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+i-1]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); +// q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+i]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); + q2 = _mm512_NFMA_ps(t2, h6, q2); +// q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[i*ldq],q1); + _mm512_store_ps(&q[(i*ldq)+16],q2); +// _mm512_store_ps(&q[(i*ldq)+32],q3); +// _mm512_store_ps(&q[(i*ldq)+48],q4); + + } + + h1 = _mm512_set1_ps(hh[nb-5]); + q1 = _mm512_load_ps(&q[nb*ldq]); + q2 = _mm512_load_ps(&q[(nb*ldq)+16]); +// q3 = _mm512_load_ps(&q[(nb*ldq)+32]); +// q4 = _mm512_load_ps(&q[(nb*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); +// q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-4]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); +// q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-3]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); +// q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-2]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); +// q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+nb-1]); + + q1 = 
_mm512_NFMA_ps(v1, h5, q1); + q2 = _mm512_NFMA_ps(v2, h5, q2); +// q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + _mm512_store_ps(&q[nb*ldq],q1); + _mm512_store_ps(&q[(nb*ldq)+16],q2); +// _mm512_store_ps(&q[(nb*ldq)+32],q3); +// _mm512_store_ps(&q[(nb*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-4]); + q1 = _mm512_load_ps(&q[(nb+1)*ldq]); + q2 = _mm512_load_ps(&q[((nb+1)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+1)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+1)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); +// q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-3]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); +// q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-2]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); +// q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-1]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); + q2 = _mm512_NFMA_ps(w2, h4, q2); +// q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + _mm512_store_ps(&q[(nb+1)*ldq],q1); + _mm512_store_ps(&q[((nb+1)*ldq)+16],q2); +// _mm512_store_ps(&q[((nb+1)*ldq)+32],q3); +// _mm512_store_ps(&q[((nb+1)*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-3]); + q1 = _mm512_load_ps(&q[(nb+2)*ldq]); + q2 = _mm512_load_ps(&q[((nb+2)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+2)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+2)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); +// q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-2]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); +// q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-1]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); + q2 = _mm512_NFMA_ps(z2, h3, q2); +// q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + _mm512_store_ps(&q[(nb+2)*ldq],q1); + _mm512_store_ps(&q[((nb+2)*ldq)+16],q2); +// _mm512_store_ps(&q[((nb+2)*ldq)+32],q3); +// _mm512_store_ps(&q[((nb+2)*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-2]); + q1 = _mm512_load_ps(&q[(nb+3)*ldq]); + q2 = _mm512_load_ps(&q[((nb+3)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+3)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+3)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); +// q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-1]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); + q2 = _mm512_NFMA_ps(y2, h2, q2); +// q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + _mm512_store_ps(&q[(nb+3)*ldq],q1); + _mm512_store_ps(&q[((nb+3)*ldq)+16],q2); +// _mm512_store_ps(&q[((nb+3)*ldq)+32],q3); +// _mm512_store_ps(&q[((nb+3)*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-1]); + q1 = _mm512_load_ps(&q[(nb+4)*ldq]); + q2 = _mm512_load_ps(&q[((nb+4)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+4)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+4)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); + q2 = _mm512_NFMA_ps(x2, h1, q2); +// q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + _mm512_store_ps(&q[(nb+4)*ldq],q1); + _mm512_store_ps(&q[((nb+4)*ldq)+16],q2); +// _mm512_store_ps(&q[((nb+4)*ldq)+32],q3); +// _mm512_store_ps(&q[((nb+4)*ldq)+48],q4); + +} + + +/** + * 
Unrolled kernel that computes + * 16 rows of Q simultaneously, a + * matrix vector product with six householder + * vectors + a rank 6 update is performed + */ +__forceinline void hh_trafo_kernel_16_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods) +{ + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [16 x nb+5] * hh + // hh contains six householder vectors + ///////////////////////////////////////////////////// + int i; + + __m512 a1_1 = _mm512_load_ps(&q[ldq*5]); + __m512 a2_1 = _mm512_load_ps(&q[ldq*4]); + __m512 a3_1 = _mm512_load_ps(&q[ldq*3]); + __m512 a4_1 = _mm512_load_ps(&q[ldq*2]); + __m512 a5_1 = _mm512_load_ps(&q[ldq]); + __m512 a6_1 = _mm512_load_ps(&q[0]); + + __m512 h_6_5 = _mm512_set1_ps(hh[(ldh*5)+1]); + __m512 h_6_4 = _mm512_set1_ps(hh[(ldh*5)+2]); + __m512 h_6_3 = _mm512_set1_ps(hh[(ldh*5)+3]); + __m512 h_6_2 = _mm512_set1_ps(hh[(ldh*5)+4]); + __m512 h_6_1 = _mm512_set1_ps(hh[(ldh*5)+5]); + +// register __m512d t1 = _mm512_FMA_ps(a5_1, h_6_5, a6_1); + __m512 t1 = _mm512_FMA_ps(a5_1, h_6_5, a6_1); + + t1 = _mm512_FMA_ps(a4_1, h_6_4, t1); + t1 = _mm512_FMA_ps(a3_1, h_6_3, t1); + t1 = _mm512_FMA_ps(a2_1, h_6_2, t1); + t1 = _mm512_FMA_ps(a1_1, h_6_1, t1); + + __m512 h_5_4 = _mm512_set1_ps(hh[(ldh*4)+1]); + __m512 h_5_3 = _mm512_set1_ps(hh[(ldh*4)+2]); + __m512 h_5_2 = _mm512_set1_ps(hh[(ldh*4)+3]); + __m512 h_5_1 = _mm512_set1_ps(hh[(ldh*4)+4]); + +// register __m512d v1 = _mm512_FMA_ps(a4_1, h_5_4, a5_1); + __m512 v1 = _mm512_FMA_ps(a4_1, h_5_4, a5_1); + + v1 = _mm512_FMA_ps(a3_1, h_5_3, v1); + v1 = _mm512_FMA_ps(a2_1, h_5_2, v1); + v1 = _mm512_FMA_ps(a1_1, h_5_1, v1); + + __m512 h_4_3 = _mm512_set1_ps(hh[(ldh*3)+1]); + __m512 h_4_2 = _mm512_set1_ps(hh[(ldh*3)+2]); + __m512 h_4_1 = _mm512_set1_ps(hh[(ldh*3)+3]); + +// register __m512d w1 = _mm512_FMA_ps(a3_1, h_4_3, a4_1); + __m512 w1 = _mm512_FMA_ps(a3_1, h_4_3, a4_1); + + w1 = _mm512_FMA_ps(a2_1, h_4_2, w1); + w1 = _mm512_FMA_ps(a1_1, h_4_1, w1); + + __m512 h_2_1 = _mm512_set1_ps(hh[ldh+1]); + __m512 h_3_2 = _mm512_set1_ps(hh[(ldh*2)+1]); + __m512 h_3_1 = _mm512_set1_ps(hh[(ldh*2)+2]); + +// register __m512d z1 = _mm512_FMA_ps(a2_1, h_3_2, a3_1); + __m512 z1 = _mm512_FMA_ps(a2_1, h_3_2, a3_1); + + z1 = _mm512_FMA_ps(a1_1, h_3_1, z1); +// register __m512d y1 = _mm512_FMA_ps(a1_1, h_2_1, a2_1); + __m512 y1 = _mm512_FMA_ps(a1_1, h_2_1, a2_1); + + +// register __m512d x1 = a1_1; + __m512 x1 = a1_1; + +// __m512 a1_2 = _mm512_load_ps(&q[(ldq*5)+16]); +// __m512 a2_2 = _mm512_load_ps(&q[(ldq*4)+16]); +// __m512 a3_2 = _mm512_load_ps(&q[(ldq*3)+16]); +// __m512 a4_2 = _mm512_load_ps(&q[(ldq*2)+16]); +// __m512 a5_2 = _mm512_load_ps(&q[(ldq)+16]); +// __m512 a6_2 = _mm512_load_ps(&q[0+16]); +// +//// register __m512d t2 = _mm512_FMA_ps(a5_2, h_6_5, a6_2); +// __m512 t2 = _mm512_FMA_ps(a5_2, h_6_5, a6_2); +// +// t2 = _mm512_FMA_ps(a4_2, h_6_4, t2); +// t2 = _mm512_FMA_ps(a3_2, h_6_3, t2); +// t2 = _mm512_FMA_ps(a2_2, h_6_2, t2); +// t2 = _mm512_FMA_ps(a1_2, h_6_1, t2); +// +//// register __m512d v2 = _mm512_FMA_ps(a4_2, h_5_4, a5_2); +// __m512 v2 = _mm512_FMA_ps(a4_2, h_5_4, a5_2); +// +// v2 = _mm512_FMA_ps(a3_2, h_5_3, v2); +// v2 = _mm512_FMA_ps(a2_2, h_5_2, v2); +// v2 = _mm512_FMA_ps(a1_2, h_5_1, v2); +// +//// register __m512d w2 = _mm512_FMA_ps(a3_2, h_4_3, a4_2); +// __m512 w2 = _mm512_FMA_ps(a3_2, h_4_3, a4_2); +// +// w2 = _mm512_FMA_ps(a2_2, h_4_2, w2); +// w2 = _mm512_FMA_ps(a1_2, h_4_1, w2); +// +//// register __m512d z2 =
_mm512_FMA_ps(a2_2, h_3_2, a3_2); +// __m512 z2 = _mm512_FMA_ps(a2_2, h_3_2, a3_2); +// +// z2 = _mm512_FMA_ps(a1_2, h_3_1, z2); +//// register __m512d y2 = _mm512_FMA_ps(a1_2, h_2_1, a2_2); +// __m512 y2 = _mm512_FMA_ps(a1_2, h_2_1, a2_2); +// +// +//// register __m512d x2 = a1_2; +// __m512 x2 = a1_2; + +// __m512 a1_3 = _mm512_load_ps(&q[(ldq*5)+32]); +// __m512 a2_3 = _mm512_load_ps(&q[(ldq*4)+32]); +// __m512 a3_3 = _mm512_load_ps(&q[(ldq*3)+32]); +// __m512 a4_3 = _mm512_load_ps(&q[(ldq*2)+32]); +// __m512 a5_3 = _mm512_load_ps(&q[(ldq)+32]); +// __m512 a6_3 = _mm512_load_ps(&q[0+32]); +// +//// register __m512d t3 = _mm512_FMA_ps(a5_3, h_6_5, a6_3); +// __m512 t3 = _mm512_FMA_ps(a5_3, h_6_5, a6_3); +// +// t3 = _mm512_FMA_ps(a4_3, h_6_4, t3); +// t3 = _mm512_FMA_ps(a3_3, h_6_3, t3); +// t3 = _mm512_FMA_ps(a2_3, h_6_2, t3); +// t3 = _mm512_FMA_ps(a1_3, h_6_1, t3); +// +//// register __m512d v3 = _mm512_FMA_ps(a4_3, h_5_4, a5_3); +// __m512 v3 = _mm512_FMA_ps(a4_3, h_5_4, a5_3); +// +// v3 = _mm512_FMA_ps(a3_3, h_5_3, v3); +// v3 = _mm512_FMA_ps(a2_3, h_5_2, v3); +// v3 = _mm512_FMA_ps(a1_3, h_5_1, v3); +// +//// register __m512d w3 = _mm512_FMA_ps(a3_3, h_4_3, a4_3); +// __m512 w3 = _mm512_FMA_ps(a3_3, h_4_3, a4_3); +// +// w3 = _mm512_FMA_ps(a2_3, h_4_2, w3); +// w3 = _mm512_FMA_ps(a1_3, h_4_1, w3); +// +//// register __m512d z3 = _mm512_FMA_ps(a2_3, h_3_2, a3_3); +// __m512 z3 = _mm512_FMA_ps(a2_3, h_3_2, a3_3); +// +// z3 = _mm512_FMA_ps(a1_3, h_3_1, z3); +//// register __m512d y3 = _mm512_FMA_ps(a1_3, h_2_1, a2_3); +// __m512 y3 = _mm512_FMA_ps(a1_3, h_2_1, a2_3); +// +// +//// register __m512d x3 = a1_3; +// __m512 x3 = a1_3; + + +// __m512 a1_4 = _mm512_load_ps(&q[(ldq*5)+48]); +// __m512 a2_4 = _mm512_load_ps(&q[(ldq*4)+48]); +// __m512 a3_4 = _mm512_load_ps(&q[(ldq*3)+48]); +// __m512 a4_4 = _mm512_load_ps(&q[(ldq*2)+48]); +// __m512 a5_4 = _mm512_load_ps(&q[(ldq)+48]); +// __m512 a6_4 = _mm512_load_ps(&q[0+48]); +// +//// register __m512d t4 = _mm512_FMA_ps(a5_4, h_6_5, a6_4); + // __m512 t4 = _mm512_FMA_ps(a5_4, h_6_5, a6_4); +// +// t4 = _mm512_FMA_ps(a4_4, h_6_4, t4); +// t4 = _mm512_FMA_ps(a3_4, h_6_3, t4); +// t4 = _mm512_FMA_ps(a2_4, h_6_2, t4); +// t4 = _mm512_FMA_ps(a1_4, h_6_1, t4); +// +//// register __m512d v4 = _mm512_FMA_ps(a4_4, h_5_4, a5_4); +// __m512 v4 = _mm512_FMA_ps(a4_4, h_5_4, a5_4); +// +// v4 = _mm512_FMA_ps(a3_4, h_5_3, v4); +// v4 = _mm512_FMA_ps(a2_4, h_5_2, v4); +// v4 = _mm512_FMA_ps(a1_4, h_5_1, v4); +// +//// register __m512d w4 = _mm512_FMA_ps(a3_4, h_4_3, a4_4); + // __m512 w4 = _mm512_FMA_ps(a3_4, h_4_3, a4_4); +// +// w4 = _mm512_FMA_ps(a2_4, h_4_2, w4); +// w4 = _mm512_FMA_ps(a1_4, h_4_1, w4); +// +//// register __m512d z4 = _mm512_FMA_ps(a2_4, h_3_2, a3_4); +// __m512 z4 = _mm512_FMA_ps(a2_4, h_3_2, a3_4); +// +// z4 = _mm512_FMA_ps(a1_4, h_3_1, z4); +//// register __m512d y4 = _mm512_FMA_ps(a1_4, h_2_1, a2_4); +// __m512 y4 = _mm512_FMA_ps(a1_4, h_2_1, a2_4); +// +// +//// register __m512d x4 = a1_4; +// __m512 x4 = a1_4; + + + __m512 q1; +// __m512 q2; +// __m512 q3; +// __m512 q4; + + __m512 h1; + __m512 h2; + __m512 h3; + __m512 h4; + __m512 h5; + __m512 h6; + + for(i = 6; i < nb; i++) + { + h1 = _mm512_set1_ps(hh[i-5]); + q1 = _mm512_load_ps(&q[i*ldq]); +// q2 = _mm512_load_ps(&q[(i*ldq)+16]); +// q3 = _mm512_load_ps(&q[(i*ldq)+32]); +// q4 = _mm512_load_ps(&q[(i*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); +// x2 = _mm512_FMA_ps(q2, h1, x2); +// x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = 
_mm512_set1_ps(hh[ldh+i-4]); + + y1 = _mm512_FMA_ps(q1, h2, y1); +// y2 = _mm512_FMA_ps(q2, h2, y2); +// y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+i-3]); + + z1 = _mm512_FMA_ps(q1, h3, z1); +// z2 = _mm512_FMA_ps(q2, h3, z2); +// z3 = _mm512_FMA_ps(q3, h3, z3); +// z4 = _mm512_FMA_ps(q4, h3, z4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+i-2]); + + w1 = _mm512_FMA_ps(q1, h4, w1); +// w2 = _mm512_FMA_ps(q2, h4, w2); +// w3 = _mm512_FMA_ps(q3, h4, w3); +// w4 = _mm512_FMA_ps(q4, h4, w4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+i-1]); + + v1 = _mm512_FMA_ps(q1, h5, v1); +// v2 = _mm512_FMA_ps(q2, h5, v2); +// v3 = _mm512_FMA_ps(q3, h5, v3); +// v4 = _mm512_FMA_ps(q4, h5, v4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+i]); + + t1 = _mm512_FMA_ps(q1, h6, t1); +// t2 = _mm512_FMA_ps(q2, h6, t2); +// t3 = _mm512_FMA_ps(q3, h6, t3); +// t4 = _mm512_FMA_ps(q4, h6, t4); + } + + h1 = _mm512_set1_ps(hh[nb-5]); + q1 = _mm512_load_ps(&q[nb*ldq]); +// q2 = _mm512_load_ps(&q[(nb*ldq)+16]); +// q3 = _mm512_load_ps(&q[(nb*ldq)+32]); +// q4 = _mm512_load_ps(&q[(nb*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); +// x2 = _mm512_FMA_ps(q2, h1, x2); +// x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-4]); + + y1 = _mm512_FMA_ps(q1, h2, y1); +// y2 = _mm512_FMA_ps(q2, h2, y2); +// y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-3]); + + z1 = _mm512_FMA_ps(q1, h3, z1); +// z2 = _mm512_FMA_ps(q2, h3, z2); +// z3 = _mm512_FMA_ps(q3, h3, z3); +// z4 = _mm512_FMA_ps(q4, h3, z4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-2]); + + w1 = _mm512_FMA_ps(q1, h4, w1); +// w2 = _mm512_FMA_ps(q2, h4, w2); +// w3 = _mm512_FMA_ps(q3, h4, w3); +// w4 = _mm512_FMA_ps(q4, h4, w4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+nb-1]); + + v1 = _mm512_FMA_ps(q1, h5, v1); +// v2 = _mm512_FMA_ps(q2, h5, v2); +// v3 = _mm512_FMA_ps(q3, h5, v3); +// v4 = _mm512_FMA_ps(q4, h5, v4); + + h1 = _mm512_set1_ps(hh[nb-4]); + + q1 = _mm512_load_ps(&q[(nb+1)*ldq]); +// q2 = _mm512_load_ps(&q[((nb+1)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+1)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+1)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); +// x2 = _mm512_FMA_ps(q2, h1, x2); +// x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-3]); + + y1 = _mm512_FMA_ps(q1, h2, y1); +// y2 = _mm512_FMA_ps(q2, h2, y2); +// y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-2]); + + z1 = _mm512_FMA_ps(q1, h3, z1); +// z2 = _mm512_FMA_ps(q2, h3, z2); +// z3 = _mm512_FMA_ps(q3, h3, z3); +// z4 = _mm512_FMA_ps(q4, h3, z4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-1]); + + w1 = _mm512_FMA_ps(q1, h4, w1); +// w2 = _mm512_FMA_ps(q2, h4, w2); +// w3 = _mm512_FMA_ps(q3, h4, w3); +// w4 = _mm512_FMA_ps(q4, h4, w4); + + h1 = _mm512_set1_ps(hh[nb-3]); + q1 = _mm512_load_ps(&q[(nb+2)*ldq]); +// q2 = _mm512_load_ps(&q[((nb+2)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+2)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+2)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); +// x2 = _mm512_FMA_ps(q2, h1, x2); +// x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-2]); + + y1 = _mm512_FMA_ps(q1, h2, y1); +// y2 = _mm512_FMA_ps(q2, h2, y2); +// y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-1]); + + z1 = _mm512_FMA_ps(q1, h3, z1); +// z2 = 
_mm512_FMA_ps(q2, h3, z2); +// z3 = _mm512_FMA_ps(q3, h3, z3); +// z4 = _mm512_FMA_ps(q4, h3, z4); + + h1 = _mm512_set1_ps(hh[nb-2]); + q1 = _mm512_load_ps(&q[(nb+3)*ldq]); +// q2 = _mm512_load_ps(&q[((nb+3)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+3)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+3)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); +// x2 = _mm512_FMA_ps(q2, h1, x2); +// x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + h2 = _mm512_set1_ps(hh[ldh+nb-1]); + + y1 = _mm512_FMA_ps(q1, h2, y1); +// y2 = _mm512_FMA_ps(q2, h2, y2); +// y3 = _mm512_FMA_ps(q3, h2, y3); +// y4 = _mm512_FMA_ps(q4, h2, y4); + + h1 = _mm512_set1_ps(hh[nb-1]); + q1 = _mm512_load_ps(&q[(nb+4)*ldq]); +// q2 = _mm512_load_ps(&q[((nb+4)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+4)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+4)*ldq)+48]); + + x1 = _mm512_FMA_ps(q1, h1, x1); +// x2 = _mm512_FMA_ps(q2, h1, x2); +// x3 = _mm512_FMA_ps(q3, h1, x3); +// x4 = _mm512_FMA_ps(q4, h1, x4); + + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// + + __m512 tau1 = _mm512_set1_ps(hh[0]); + x1 = _mm512_mul_ps(x1, tau1); +// x2 = _mm512_mul_ps(x2, tau1); +// x3 = _mm512_mul_ps(x3, tau1); +// x4 = _mm512_mul_ps(x4, tau1); + + __m512 tau2 = _mm512_set1_ps(hh[ldh]); + __m512 vs_1_2 = _mm512_set1_ps(scalarprods[0]); + h2 = _mm512_mul_ps(tau2, vs_1_2); + + y1 = _mm512_FMSUB_ps(y1, tau2, _mm512_mul_ps(x1,h2)); +// y2 = _mm512_FMSUB_ps(y2, tau2, _mm512_mul_ps(x2,h2)); +// y3 = _mm512_FMSUB_ps(y3, tau2, _mm512_mul_ps(x3,h2)); +// y4 = _mm512_FMSUB_ps(y4, tau2, _mm512_mul_ps(x4,h2)); + + __m512 tau3 = _mm512_set1_ps(hh[ldh*2]); + __m512 vs_1_3 = _mm512_set1_ps(scalarprods[1]); + __m512 vs_2_3 = _mm512_set1_ps(scalarprods[2]); + + h2 = _mm512_mul_ps(tau3, vs_1_3); + h3 = _mm512_mul_ps(tau3, vs_2_3); + + z1 = _mm512_FMSUB_ps(z1, tau3, _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2))); +// z2 = _mm512_FMSUB_ps(z2, tau3, _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2))); +// z3 = _mm512_FMSUB_ps(z3, tau3, _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2))); +// z4 = _mm512_FMSUB_ps(z4, tau3, _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2))); + + __m512 tau4 = _mm512_set1_ps(hh[ldh*3]); + __m512 vs_1_4 = _mm512_set1_ps(scalarprods[3]); + __m512 vs_2_4 = _mm512_set1_ps(scalarprods[4]); + + h2 = _mm512_mul_ps(tau4, vs_1_4); + h3 = _mm512_mul_ps(tau4, vs_2_4); + + __m512 vs_3_4 = _mm512_set1_ps(scalarprods[5]); + h4 = _mm512_mul_ps(tau4, vs_3_4); + + w1 = _mm512_FMSUB_ps(w1, tau4, _mm512_FMA_ps(z1, h4, _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2)))); +// w2 = _mm512_FMSUB_ps(w2, tau4, _mm512_FMA_ps(z2, h4, _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2)))); +// w3 = _mm512_FMSUB_ps(w3, tau4, _mm512_FMA_ps(z3, h4, _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2)))); +// w4 = _mm512_FMSUB_ps(w4, tau4, _mm512_FMA_ps(z4, h4, _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2)))); + + __m512 tau5 = _mm512_set1_ps(hh[ldh*4]); + __m512 vs_1_5 = _mm512_set1_ps(scalarprods[6]); + __m512 vs_2_5 = _mm512_set1_ps(scalarprods[7]); + + h2 = _mm512_mul_ps(tau5, vs_1_5); + h3 = _mm512_mul_ps(tau5, vs_2_5); + + __m512 vs_3_5 = _mm512_set1_ps(scalarprods[8]); + __m512 vs_4_5 = _mm512_set1_ps(scalarprods[9]); + + h4 = _mm512_mul_ps(tau5, vs_3_5); + h5 = _mm512_mul_ps(tau5, vs_4_5); + + v1 = _mm512_FMSUB_ps(v1, tau5, _mm512_add_ps(_mm512_FMA_ps(w1, h5, _mm512_mul_ps(z1,h4)), _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2)))); +// v2 = _mm512_FMSUB_ps(v2, 
tau5, _mm512_add_ps(_mm512_FMA_ps(w2, h5, _mm512_mul_ps(z2,h4)), _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2)))); +// v3 = _mm512_FMSUB_ps(v3, tau5, _mm512_add_ps(_mm512_FMA_ps(w3, h5, _mm512_mul_ps(z3,h4)), _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2)))); +// v4 = _mm512_FMSUB_ps(v4, tau5, _mm512_add_ps(_mm512_FMA_ps(w4, h5, _mm512_mul_ps(z4,h4)), _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2)))); + + __m512 tau6 = _mm512_set1_ps(hh[ldh*5]); + __m512 vs_1_6 = _mm512_set1_ps(scalarprods[10]); + __m512 vs_2_6 = _mm512_set1_ps(scalarprods[11]); + h2 = _mm512_mul_ps(tau6, vs_1_6); + h3 = _mm512_mul_ps(tau6, vs_2_6); + + __m512 vs_3_6 = _mm512_set1_ps(scalarprods[12]); + __m512 vs_4_6 = _mm512_set1_ps(scalarprods[13]); + __m512 vs_5_6 = _mm512_set1_ps(scalarprods[14]); + + h4 = _mm512_mul_ps(tau6, vs_3_6); + h5 = _mm512_mul_ps(tau6, vs_4_6); + h6 = _mm512_mul_ps(tau6, vs_5_6); + + t1 = _mm512_FMSUB_ps(t1, tau6, _mm512_FMA_ps(v1, h6, _mm512_add_ps(_mm512_FMA_ps(w1, h5, _mm512_mul_ps(z1,h4)), _mm512_FMA_ps(y1, h3, _mm512_mul_ps(x1,h2))))); +// t2 = _mm512_FMSUB_ps(t2, tau6, _mm512_FMA_ps(v2, h6, _mm512_add_ps(_mm512_FMA_ps(w2, h5, _mm512_mul_ps(z2,h4)), _mm512_FMA_ps(y2, h3, _mm512_mul_ps(x2,h2))))); +// t3 = _mm512_FMSUB_ps(t3, tau6, _mm512_FMA_ps(v3, h6, _mm512_add_ps(_mm512_FMA_ps(w3, h5, _mm512_mul_ps(z3,h4)), _mm512_FMA_ps(y3, h3, _mm512_mul_ps(x3,h2))))); +// t4 = _mm512_FMSUB_ps(t4, tau6, _mm512_FMA_ps(v4, h6, _mm512_add_ps(_mm512_FMA_ps(w4, h5, _mm512_mul_ps(z4,h4)), _mm512_FMA_ps(y4, h3, _mm512_mul_ps(x4,h2))))); + + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [8 x nb+3] + ///////////////////////////////////////////////////// + + q1 = _mm512_load_ps(&q[0]); +// q2 = _mm512_load_ps(&q[0+16]); +// q3 = _mm512_load_ps(&q[0+32]); +// q4 = _mm512_load_ps(&q[0+48]); + + q1 = _mm512_sub_ps(q1, t1); +// q2 = _mm512_sub_ps(q2, t2); +// q3 = _mm512_sub_ps(q3, t3); +// q4 = _mm512_sub_ps(q4, t4); + + _mm512_store_ps(&q[0],q1); +// _mm512_store_ps(&q[0+16],q2); +// _mm512_store_ps(&q[0+32],q3); +// _mm512_store_ps(&q[0+48],q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+1]); + q1 = _mm512_load_ps(&q[ldq]); +// q2 = _mm512_load_ps(&q[ldq+16]); +// q3 = _mm512_load_ps(&q[ldq+32]); +// q4 = _mm512_load_ps(&q[ldq+48]); + + q1 = _mm512_sub_ps(q1, v1); +// q2 = _mm512_sub_ps(q2, v2); +// q3 = _mm512_sub_ps(q3, v3); +// q4 = _mm512_sub_ps(q4, v4); + + q1 = _mm512_NFMA_ps(t1, h6, q1); +// q2 = _mm512_NFMA_ps(t2, h6, q2); +// q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq],q1); +// _mm512_store_ps(&q[ldq+16],q2); +// _mm512_store_ps(&q[ldq+32],q3); +// _mm512_store_ps(&q[ldq+48],q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+1]); + q1 = _mm512_load_ps(&q[ldq*2]); +// q2 = _mm512_load_ps(&q[(ldq*2)+16]); +// q3 = _mm512_load_ps(&q[(ldq*2)+32]); +// q4 = _mm512_load_ps(&q[(ldq*2)+48]); + + q1 = _mm512_sub_ps(q1, w1); +// q2 = _mm512_sub_ps(q2, w2); +// q3 = _mm512_sub_ps(q3, w3); +// q4 = _mm512_sub_ps(q4, w4); + + q1 = _mm512_NFMA_ps(v1, h5, q1); +// q2 = _mm512_NFMA_ps(v2, h5, q2); +// q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+2]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); +// q2 = _mm512_NFMA_ps(t2, h6, q2); +// q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*2],q1); +// _mm512_store_ps(&q[(ldq*2)+16],q2); +// _mm512_store_ps(&q[(ldq*2)+32],q3); +// _mm512_store_ps(&q[(ldq*2)+48],q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+1]); + q1 = 
_mm512_load_ps(&q[ldq*3]); +// q2 = _mm512_load_ps(&q[(ldq*3)+16]); +// q3 = _mm512_load_ps(&q[(ldq*3)+32]); +// q4 = _mm512_load_ps(&q[(ldq*3)+48]); + + q1 = _mm512_sub_ps(q1, z1); +// q2 = _mm512_sub_ps(q2, z2); +// q3 = _mm512_sub_ps(q3, z3); +// q4 = _mm512_sub_ps(q4, z4); + + q1 = _mm512_NFMA_ps(w1, h4, q1); +// q2 = _mm512_NFMA_ps(w2, h4, q2); +// q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+2]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); +// q2 = _mm512_NFMA_ps(v2, h5, q2); +// q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+3]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); +// q2 = _mm512_NFMA_ps(t2, h6, q2); +// q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*3],q1); +// _mm512_store_ps(&q[(ldq*3)+16],q2); +// _mm512_store_ps(&q[(ldq*3)+32],q3); +// _mm512_store_ps(&q[(ldq*3)+48],q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+1]); + q1 = _mm512_load_ps(&q[ldq*4]); +// q2 = _mm512_load_ps(&q[(ldq*4)+16]); +// q3 = _mm512_load_ps(&q[(ldq*4)+32]); +// q4 = _mm512_load_ps(&q[(ldq*4)+48]); + + q1 = _mm512_sub_ps(q1, y1); +// q2 = _mm512_sub_ps(q2, y2); +// q3 = _mm512_sub_ps(q3, y3); +// q4 = _mm512_sub_ps(q4, y4); + + q1 = _mm512_NFMA_ps(z1, h3, q1); +// q2 = _mm512_NFMA_ps(z2, h3, q2); +// q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+2]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); +// q2 = _mm512_NFMA_ps(w2, h4, q2); +// q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+3]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); +// q2 = _mm512_NFMA_ps(v2, h5, q2); +// q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+4]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); +// q2 = _mm512_NFMA_ps(t2, h6, q2); +// q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*4],q1); +// _mm512_store_ps(&q[(ldq*4)+16],q2); +// _mm512_store_ps(&q[(ldq*4)+32],q3); +// _mm512_store_ps(&q[(ldq*4)+48],q4); + + h2 = _mm512_set1_ps(hh[(ldh)+1]); + q1 = _mm512_load_ps(&q[ldq*5]); +// q2 = _mm512_load_ps(&q[(ldq*5)+16]); +// q3 = _mm512_load_ps(&q[(ldq*5)+32]); +// q4 = _mm512_load_ps(&q[(ldq*5)+48]); + + q1 = _mm512_sub_ps(q1, x1); +// q2 = _mm512_sub_ps(q2, x2); +// q3 = _mm512_sub_ps(q3, x3); +// q4 = _mm512_sub_ps(q4, x4); + + q1 = _mm512_NFMA_ps(y1, h2, q1); +// q2 = _mm512_NFMA_ps(y2, h2, q2); +// q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+2]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); +// q2 = _mm512_NFMA_ps(z2, h3, q2); +// q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+3]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); +// q2 = _mm512_NFMA_ps(w2, h4, q2); +// q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+4]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); +// q2 = _mm512_NFMA_ps(v2, h5, q2); +// q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+5]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); +// q2 = _mm512_NFMA_ps(t2, h6, q2); +// q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[ldq*5],q1); +// _mm512_store_ps(&q[(ldq*5)+16],q2); +// _mm512_store_ps(&q[(ldq*5)+32],q3); +// _mm512_store_ps(&q[(ldq*5)+48],q4); + + for (i = 6; i < nb; 
i++) + { + q1 = _mm512_load_ps(&q[i*ldq]); +// q2 = _mm512_load_ps(&q[(i*ldq)+16]); +// q3 = _mm512_load_ps(&q[(i*ldq)+32]); +// q4 = _mm512_load_ps(&q[(i*ldq)+48]); + + h1 = _mm512_set1_ps(hh[i-5]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); +// q2 = _mm512_NFMA_ps(x2, h1, q2); +// q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+i-4]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); +// q2 = _mm512_NFMA_ps(y2, h2, q2); +// q3 = _mm512_NFMA_ps(y3, h2, q3); + // q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+i-3]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); +// q2 = _mm512_NFMA_ps(z2, h3, q2); +// q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+i-2]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); +// q2 = _mm512_NFMA_ps(w2, h4, q2); +// q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+i-1]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); +// q2 = _mm512_NFMA_ps(v2, h5, q2); +// q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + h6 = _mm512_set1_ps(hh[(ldh*5)+i]); + + q1 = _mm512_NFMA_ps(t1, h6, q1); +// q2 = _mm512_NFMA_ps(t2, h6, q2); +// q3 = _mm512_NFMA_ps(t3, h6, q3); +// q4 = _mm512_NFMA_ps(t4, h6, q4); + + _mm512_store_ps(&q[i*ldq],q1); +// _mm512_store_ps(&q[(i*ldq)+16],q2); +// _mm512_store_ps(&q[(i*ldq)+32],q3); +// _mm512_store_ps(&q[(i*ldq)+48],q4); + + } + + h1 = _mm512_set1_ps(hh[nb-5]); + q1 = _mm512_load_ps(&q[nb*ldq]); +// q2 = _mm512_load_ps(&q[(nb*ldq)+16]); +// q3 = _mm512_load_ps(&q[(nb*ldq)+32]); +// q4 = _mm512_load_ps(&q[(nb*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); +// q2 = _mm512_NFMA_ps(x2, h1, q2); +// q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-4]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); +// q2 = _mm512_NFMA_ps(y2, h2, q2); +// q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-3]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); +// q2 = _mm512_NFMA_ps(z2, h3, q2); +// q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-2]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); +// q2 = _mm512_NFMA_ps(w2, h4, q2); +// q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + h5 = _mm512_set1_ps(hh[(ldh*4)+nb-1]); + + q1 = _mm512_NFMA_ps(v1, h5, q1); +// q2 = _mm512_NFMA_ps(v2, h5, q2); +// q3 = _mm512_NFMA_ps(v3, h5, q3); +// q4 = _mm512_NFMA_ps(v4, h5, q4); + + _mm512_store_ps(&q[nb*ldq],q1); +// _mm512_store_ps(&q[(nb*ldq)+16],q2); +// _mm512_store_ps(&q[(nb*ldq)+32],q3); +// _mm512_store_ps(&q[(nb*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-4]); + q1 = _mm512_load_ps(&q[(nb+1)*ldq]); +// q2 = _mm512_load_ps(&q[((nb+1)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+1)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+1)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); +// q2 = _mm512_NFMA_ps(x2, h1, q2); +// q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-3]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); +// q2 = _mm512_NFMA_ps(y2, h2, q2); +// q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-2]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); +// q2 = _mm512_NFMA_ps(z2, h3, q2); +// q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + h4 = _mm512_set1_ps(hh[(ldh*3)+nb-1]); + + q1 = _mm512_NFMA_ps(w1, h4, q1); +// q2 = 
_mm512_NFMA_ps(w2, h4, q2); +// q3 = _mm512_NFMA_ps(w3, h4, q3); +// q4 = _mm512_NFMA_ps(w4, h4, q4); + + _mm512_store_ps(&q[(nb+1)*ldq],q1); +// _mm512_store_ps(&q[((nb+1)*ldq)+16],q2); +// _mm512_store_ps(&q[((nb+1)*ldq)+32],q3); +// _mm512_store_ps(&q[((nb+1)*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-3]); + q1 = _mm512_load_ps(&q[(nb+2)*ldq]); +// q2 = _mm512_load_ps(&q[((nb+2)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+2)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+2)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); +// q2 = _mm512_NFMA_ps(x2, h1, q2); +// q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-2]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); +// q2 = _mm512_NFMA_ps(y2, h2, q2); +// q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + h3 = _mm512_set1_ps(hh[(ldh*2)+nb-1]); + + q1 = _mm512_NFMA_ps(z1, h3, q1); +// q2 = _mm512_NFMA_ps(z2, h3, q2); +// q3 = _mm512_NFMA_ps(z3, h3, q3); +// q4 = _mm512_NFMA_ps(z4, h3, q4); + + _mm512_store_ps(&q[(nb+2)*ldq],q1); +// _mm512_store_ps(&q[((nb+2)*ldq)+16],q2); +// _mm512_store_ps(&q[((nb+2)*ldq)+32],q3); +// _mm512_store_ps(&q[((nb+2)*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-2]); + q1 = _mm512_load_ps(&q[(nb+3)*ldq]); +// q2 = _mm512_load_ps(&q[((nb+3)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+3)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+3)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); +// q2 = _mm512_NFMA_ps(x2, h1, q2); +// q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + h2 = _mm512_set1_ps(hh[ldh+nb-1]); + + q1 = _mm512_NFMA_ps(y1, h2, q1); +// q2 = _mm512_NFMA_ps(y2, h2, q2); +// q3 = _mm512_NFMA_ps(y3, h2, q3); +// q4 = _mm512_NFMA_ps(y4, h2, q4); + + _mm512_store_ps(&q[(nb+3)*ldq],q1); +// _mm512_store_ps(&q[((nb+3)*ldq)+16],q2); +// _mm512_store_ps(&q[((nb+3)*ldq)+32],q3); +// _mm512_store_ps(&q[((nb+3)*ldq)+48],q4); + + h1 = _mm512_set1_ps(hh[nb-1]); + q1 = _mm512_load_ps(&q[(nb+4)*ldq]); +// q2 = _mm512_load_ps(&q[((nb+4)*ldq)+16]); +// q3 = _mm512_load_ps(&q[((nb+4)*ldq)+32]); +// q4 = _mm512_load_ps(&q[((nb+4)*ldq)+48]); + + q1 = _mm512_NFMA_ps(x1, h1, q1); +// q2 = _mm512_NFMA_ps(x2, h1, q2); +// q3 = _mm512_NFMA_ps(x3, h1, q3); +// q4 = _mm512_NFMA_ps(x4, h1, q4); + + _mm512_store_ps(&q[(nb+4)*ldq],q1); +// _mm512_store_ps(&q[((nb+4)*ldq)+16],q2); +// _mm512_store_ps(&q[((nb+4)*ldq)+32],q3); +// _mm512_store_ps(&q[((nb+4)*ldq)+48],q4); + } diff --git a/src/mod_compute_hh_trafo_real.F90 b/src/mod_compute_hh_trafo_real.F90 index 49a6445f188d64b3c0c351a8d146e9349687d460..7ca7d1f6e2f8d0e6b2db7a58fecaa1b09692b270 100644 --- a/src/mod_compute_hh_trafo_real.F90 +++ b/src/mod_compute_hh_trafo_real.F90 @@ -1407,66 +1407,66 @@ module compute_hh_trafo_real #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_REAL_AVX_BLOCK6_KERNEL || WITH_REAL_AVX2_BLOCK6_KERNEL */ -!#if defined(WITH_REAL_AVX512_BLOCK6_KERNEL) -!#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) -! -! if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK6)) then -!#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ -! ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS -! do j = ncols, 6, -6 -! w(:,1) = bcast_buffer(1:nbw,j+off) -! w(:,2) = bcast_buffer(1:nbw,j+off-1) -! w(:,3) = bcast_buffer(1:nbw,j+off-2) -! w(:,4) = bcast_buffer(1:nbw,j+off-3) -! w(:,5) = bcast_buffer(1:nbw,j+off-4) -! w(:,6) = bcast_buffer(1:nbw,j+off-5) -! -!#ifdef WITH_OPENMP -! call hexa_hh_trafo_real_avx512_6hv_single(c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, & -! 
nbw, nl, stripe_width, nbw) -!#else -! call hexa_hh_trafo_real_avx512_6hv_single(c_loc(a(1,j+off+a_off-5,istripe)), w, & -! nbw, nl, stripe_width, nbw) -!#endif -! enddo -! do jj = j, 4, -4 -! w(:,1) = bcast_buffer(1:nbw,jj+off) -! w(:,2) = bcast_buffer(1:nbw,jj+off-1) -! w(:,3) = bcast_buffer(1:nbw,jj+off-2) -! w(:,4) = bcast_buffer(1:nbw,jj+off-3) -! -!#ifdef WITH_OPENMP -! call quad_hh_trafo_real_avx512_4hv_single(c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, & -! nbw, nl, stripe_width, nbw) -!#else -! call quad_hh_trafo_real_avx512_4hv_single(c_loc(a(1,jj+off+a_off-3,istripe)), w, & -! nbw, nl, stripe_width, nbw) -!#endif -! enddo -! do jjj = jj, 2, -2 -! w(:,1) = bcast_buffer(1:nbw,jjj+off) -! w(:,2) = bcast_buffer(1:nbw,jjj+off-1) -! -!#ifdef WITH_OPENMP -! call double_hh_trafo_real_avx512_2hv_single(c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), & -! w, nbw, nl, stripe_width, nbw) -!#else -! call double_hh_trafo_real_avx512_2hv_single(c_loc(a(1,jjj+off+a_off-1,istripe)), & -! w, nbw, nl, stripe_width, nbw) -!#endif -! enddo -!#ifdef WITH_OPENMP -! if (jjj==1) call single_hh_trafo_real_cpu_openmp_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, & -! istripe,my_thread), & -! bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) -!#else -! if (jjj==1) call single_hh_trafo_real_cpu_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & -! bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) -!#endif -!#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) -! endif -!#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ -!#endif /* WITH_REAL_AVX512_BLOCK6_KERNEL */ +#if defined(WITH_REAL_AVX512_BLOCK6_KERNEL) +#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) + + if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK6)) then +#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ + ! 
X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS + do j = ncols, 6, -6 + w(:,1) = bcast_buffer(1:nbw,j+off) + w(:,2) = bcast_buffer(1:nbw,j+off-1) + w(:,3) = bcast_buffer(1:nbw,j+off-2) + w(:,4) = bcast_buffer(1:nbw,j+off-3) + w(:,5) = bcast_buffer(1:nbw,j+off-4) + w(:,6) = bcast_buffer(1:nbw,j+off-5) + +#ifdef WITH_OPENMP + call hexa_hh_trafo_real_avx512_6hv_single(c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, & + nbw, nl, stripe_width, nbw) +#else + call hexa_hh_trafo_real_avx512_6hv_single(c_loc(a(1,j+off+a_off-5,istripe)), w, & + nbw, nl, stripe_width, nbw) +#endif + enddo + do jj = j, 4, -4 + w(:,1) = bcast_buffer(1:nbw,jj+off) + w(:,2) = bcast_buffer(1:nbw,jj+off-1) + w(:,3) = bcast_buffer(1:nbw,jj+off-2) + w(:,4) = bcast_buffer(1:nbw,jj+off-3) + +#ifdef WITH_OPENMP + call quad_hh_trafo_real_avx512_4hv_single(c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, & + nbw, nl, stripe_width, nbw) +#else + call quad_hh_trafo_real_avx512_4hv_single(c_loc(a(1,jj+off+a_off-3,istripe)), w, & + nbw, nl, stripe_width, nbw) +#endif + enddo + do jjj = jj, 2, -2 + w(:,1) = bcast_buffer(1:nbw,jjj+off) + w(:,2) = bcast_buffer(1:nbw,jjj+off-1) + +#ifdef WITH_OPENMP + call double_hh_trafo_real_avx512_2hv_single(c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), & + w, nbw, nl, stripe_width, nbw) +#else + call double_hh_trafo_real_avx512_2hv_single(c_loc(a(1,jjj+off+a_off-1,istripe)), & + w, nbw, nl, stripe_width, nbw) +#endif + enddo +#ifdef WITH_OPENMP + if (jjj==1) call single_hh_trafo_real_cpu_openmp_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, & + istripe,my_thread), & + bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) +#else + if (jjj==1) call single_hh_trafo_real_cpu_single(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), & + bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width) +#endif +#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) + endif +#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ +#endif /* WITH_REAL_AVX512_BLOCK6_KERNEL */ endif ! GPU_KERNEL