
Commit 3dfe8ad2 authored by Andreas Marek

Start to unify double/single real AVX512 block6 kernel

parent f647c843
@@ -599,6 +599,7 @@ EXTRA_DIST = \
src/elpa2/kernels/elpa2_kernels_complex_sse_1hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_real_avx512_2hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_real_avx512_4hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_real_avx512_6hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_complex_sse_2hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_complex_avx-avx2_1hv_template.Xc \
src/elpa2/kernels/elpa2_kernels_complex_avx-avx2_2hv_template.Xc \
@@ -42,3560 +42,14 @@
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
// Author: Andreas Marek, MPCDF
#include "config-f90.h"
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "elpa2_kernels_real_avx512_6hv_template.Xc"
#undef REALCASE
#undef DOUBLE_PRECISION
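/* With this change the double-precision file becomes a thin wrapper around the
 * shared template. The commit message suggests the single-precision variant is
 * meant to be generated the same way; a minimal sketch of such a wrapper is
 * shown below (the SINGLE_PRECISION macro and a separate wrapper file are
 * assumptions, not part of this diff):
 *
 *   #include "config-f90.h"
 *   #include <x86intrin.h>
 *
 *   #define __forceinline __attribute__((always_inline)) static
 *
 *   #define REALCASE 1
 *   #define SINGLE_PRECISION 1
 *   #include "../../general/precision_macros.h"
 *   #include "elpa2_kernels_real_avx512_6hv_template.Xc"
 *   #undef REALCASE
 *   #undef SINGLE_PRECISION
 */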
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
#define _mm512_FMA_pd(a,b,c) _mm512_fmadd_pd(a,b,c)
#define _mm512_NFMA_pd(a,b,c) _mm512_fnmadd_pd(a,b,c)
#define _mm512_FMSUB_pd(a,b,c) _mm512_fmsub_pd(a,b,c)
#endif
// Forward declarations
//static void hh_trafo_kernel_4_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_8_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_16_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_24_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_32_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
void hexa_hh_trafo_real_avx512_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
/*
!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine hexa_hh_trafo_real_avx512_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_avx512_6hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
void hexa_hh_trafo_real_avx512_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
int i;
int nb = *pnb;
int nq = *pldq;
int ldq = *pldq;
int ldh = *pldh;
// calculating scalar products to compute
// 6 householder vectors simultaneously
double scalarprods[15];
scalarprods[0] = hh[(ldh+1)];
scalarprods[1] = hh[(ldh*2)+2];
scalarprods[2] = hh[(ldh*2)+1];
scalarprods[3] = hh[(ldh*3)+3];
scalarprods[4] = hh[(ldh*3)+2];
scalarprods[5] = hh[(ldh*3)+1];
scalarprods[6] = hh[(ldh*4)+4];
scalarprods[7] = hh[(ldh*4)+3];
scalarprods[8] = hh[(ldh*4)+2];
scalarprods[9] = hh[(ldh*4)+1];
scalarprods[10] = hh[(ldh*5)+5];
scalarprods[11] = hh[(ldh*5)+4];
scalarprods[12] = hh[(ldh*5)+3];
scalarprods[13] = hh[(ldh*5)+2];
scalarprods[14] = hh[(ldh*5)+1];
// accumulate the scalar products of the householder vectors for the unrolled first iterations
// loop counter = 2
scalarprods[0] += hh[1] * hh[(2+ldh)];
scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];
// loop counter = 3
scalarprods[0] += hh[2] * hh[(3+ldh)];
scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];
scalarprods[1] += hh[1] * hh[3+(ldh*2)];
scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];
// loop counter = 4
scalarprods[0] += hh[3] * hh[(4+ldh)];
scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];
scalarprods[1] += hh[2] * hh[4+(ldh*2)];
scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];
scalarprods[3] += hh[1] * hh[4+(ldh*3)];
scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];
// loop counter = 5
scalarprods[0] += hh[4] * hh[(5+ldh)];
scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];
scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];
scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];
scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)];
scalarprods[1] += hh[3] * hh[5+(ldh*2)];
scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];
scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];
scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)];
scalarprods[3] += hh[2] * hh[5+(ldh*3)];
scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];
scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)];
scalarprods[6] += hh[1] * hh[5+(ldh*4)];
scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];
#pragma ivdep
for (i = 6; i < nb; i++)
{
scalarprods[0] += hh[i-1] * hh[(i+ldh)];
scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];
scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];
scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];
scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];
scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];
scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)];
scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];
scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];
scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)];
scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];
scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];
scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
}
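/* For reference: hh[(ldh*j)+i] is element i of householder vector j+1, and the
 * 15 scalarprods entries hold the pairwise products s(j,k) of the six vectors,
 * packed exactly as the vs_j_k constants in the kernels read them (illustrative
 * summary, derived from the code above):
 *   [0]                      = s(1,2)
 *   [1]  [2]                 = s(1,3) s(2,3)
 *   [3]  [4]  [5]            = s(1,4) s(2,4) s(3,4)
 *   [6]  [7]  [8]  [9]       = s(1,5) s(2,5) s(3,5) s(4,5)
 *   [10] [11] [12] [13] [14] = s(1,6) s(2,6) s(3,6) s(4,6) s(5,6)
 */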
// Production level kernel calls with padding
for (i = 0; i < nq-24; i+=32)
{
hh_trafo_kernel_32_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
}
if (nq == i)
{
return;
}
if (nq-i == 24)
{
hh_trafo_kernel_24_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
}
else if (nq-i == 16)
{
hh_trafo_kernel_16_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
}
else
{
hh_trafo_kernel_8_AVX512_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
}
}
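/* A hypothetical C call site for the driver above (sizes are illustrative
 * only; in ELPA q and hh are provided by the Fortran back-transformation
 * code, with all integer arguments passed by reference): */
#if 0
static void example_call(void)
{
	int nb = 64, nq = 32, ldq = 32, ldh = 64;
	/* 64-byte alignment because the kernels use _mm512_load_pd/_mm512_store_pd */
	double *q  = _mm_malloc((size_t)ldq * (nb + 5) * sizeof(double), 64);
	double *hh = _mm_malloc((size_t)ldh * 6 * sizeof(double), 64);
	/* ... fill q and hh ... */
	hexa_hh_trafo_real_avx512_6hv_double(q, hh, &nb, &nq, &ldq, &ldh);
	_mm_free(q);
	_mm_free(hh);
}
#endif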
/**
* Unrolled kernel that computes
* 8 rows of Q simultaneously, a
* matrix vector product with six householder
* vectors + a rank 1 update is performed
*/
__forceinline void hh_trafo_kernel_8_AVX512_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [8 x nb+5] * hh
// hh contains six householder vectors
/////////////////////////////////////////////////////
int i;
__m512d a1_1 = _mm512_load_pd(&q[ldq*5]);
__m512d a2_1 = _mm512_load_pd(&q[ldq*4]);
__m512d a3_1 = _mm512_load_pd(&q[ldq*3]);
__m512d a4_1 = _mm512_load_pd(&q[ldq*2]);
__m512d a5_1 = _mm512_load_pd(&q[ldq]);
__m512d a6_1 = _mm512_load_pd(&q[0]);
__m512d h_6_5 = _mm512_set1_pd(hh[(ldh*5)+1]);
__m512d h_6_4 = _mm512_set1_pd(hh[(ldh*5)+2]);
__m512d h_6_3 = _mm512_set1_pd(hh[(ldh*5)+3]);
__m512d h_6_2 = _mm512_set1_pd(hh[(ldh*5)+4]);
__m512d h_6_1 = _mm512_set1_pd(hh[(ldh*5)+5]);
// register __m512d t1 = _mm512_FMA_pd(a5_1, h_6_5, a6_1);
__m512d t1 = _mm512_FMA_pd(a5_1, h_6_5, a6_1);
t1 = _mm512_FMA_pd(a4_1, h_6_4, t1);
t1 = _mm512_FMA_pd(a3_1, h_6_3, t1);
t1 = _mm512_FMA_pd(a2_1, h_6_2, t1);
t1 = _mm512_FMA_pd(a1_1, h_6_1, t1);
__m512d h_5_4 = _mm512_set1_pd(hh[(ldh*4)+1]);
__m512d h_5_3 = _mm512_set1_pd(hh[(ldh*4)+2]);
__m512d h_5_2 = _mm512_set1_pd(hh[(ldh*4)+3]);
__m512d h_5_1 = _mm512_set1_pd(hh[(ldh*4)+4]);
// register __m512d v1 = _mm512_FMA_pd(a4_1, h_5_4, a5_1);
__m512d v1 = _mm512_FMA_pd(a4_1, h_5_4, a5_1);
v1 = _mm512_FMA_pd(a3_1, h_5_3, v1);
v1 = _mm512_FMA_pd(a2_1, h_5_2, v1);
v1 = _mm512_FMA_pd(a1_1, h_5_1, v1);
__m512d h_4_3 = _mm512_set1_pd(hh[(ldh*3)+1]);
__m512d h_4_2 = _mm512_set1_pd(hh[(ldh*3)+2]);
__m512d h_4_1 = _mm512_set1_pd(hh[(ldh*3)+3]);
// register __m512d w1 = _mm512_FMA_pd(a3_1, h_4_3, a4_1);
__m512d w1 = _mm512_FMA_pd(a3_1, h_4_3, a4_1);
w1 = _mm512_FMA_pd(a2_1, h_4_2, w1);
w1 = _mm512_FMA_pd(a1_1, h_4_1, w1);
__m512d h_2_1 = _mm512_set1_pd(hh[ldh+1]);
__m512d h_3_2 = _mm512_set1_pd(hh[(ldh*2)+1]);
__m512d h_3_1 = _mm512_set1_pd(hh[(ldh*2)+2]);
// register __m512d z1 = _mm512_FMA_pd(a2_1, h_3_2, a3_1);
__m512d z1 = _mm512_FMA_pd(a2_1, h_3_2, a3_1);
z1 = _mm512_FMA_pd(a1_1, h_3_1, z1);
// register __m512d y1 = _mm512_FMA_pd(a1_1, h_2_1, a2_1);
__m512d y1 = _mm512_FMA_pd(a1_1, h_2_1, a2_1);
// register __m512d x1 = a1_1;
__m512d x1 = a1_1;
__m512d q1;
__m512d h1;
__m512d h2;
__m512d h3;
__m512d h4;
__m512d h5;
__m512d h6;
for(i = 6; i < nb; i++)
{
h1 = _mm512_set1_pd(hh[i-5]);
q1 = _mm512_load_pd(&q[i*ldq]);
x1 = _mm512_FMA_pd(q1, h1, x1);
h2 = _mm512_set1_pd(hh[ldh+i-4]);
y1 = _mm512_FMA_pd(q1, h2, y1);
h3 = _mm512_set1_pd(hh[(ldh*2)+i-3]);
z1 = _mm512_FMA_pd(q1, h3, z1);
h4 = _mm512_set1_pd(hh[(ldh*3)+i-2]);
w1 = _mm512_FMA_pd(q1, h4, w1);
h5 = _mm512_set1_pd(hh[(ldh*4)+i-1]);
v1 = _mm512_FMA_pd(q1, h5, v1);
h6 = _mm512_set1_pd(hh[(ldh*5)+i]);
t1 = _mm512_FMA_pd(q1, h6, t1);
}
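/* Rows nb .. nb+4 of q are reached by progressively fewer of the six staggered
 * householder vectors (5, 4, 3, 2, then 1), so each of the following blocks
 * drops one more accumulator. */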
h1 = _mm512_set1_pd(hh[nb-5]);
q1 = _mm512_load_pd(&q[nb*ldq]);
x1 = _mm512_FMA_pd(q1, h1, x1);
h2 = _mm512_set1_pd(hh[ldh+nb-4]);
y1 = _mm512_FMA_pd(q1, h2, y1);
h3 = _mm512_set1_pd(hh[(ldh*2)+nb-3]);
z1 = _mm512_FMA_pd(q1, h3, z1);
h4 = _mm512_set1_pd(hh[(ldh*3)+nb-2]);
w1 = _mm512_FMA_pd(q1, h4, w1);
h5 = _mm512_set1_pd(hh[(ldh*4)+nb-1]);
v1 = _mm512_FMA_pd(q1, h5, v1);
h1 = _mm512_set1_pd(hh[nb-4]);
q1 = _mm512_load_pd(&q[(nb+1)*ldq]);
x1 = _mm512_FMA_pd(q1, h1, x1);
h2 = _mm512_set1_pd(hh[ldh+nb-3]);
y1 = _mm512_FMA_pd(q1, h2, y1);
h3 = _mm512_set1_pd(hh[(ldh*2)+nb-2]);
z1 = _mm512_FMA_pd(q1, h3, z1);
h4 = _mm512_set1_pd(hh[(ldh*3)+nb-1]);
w1 = _mm512_FMA_pd(q1, h4, w1);
h1 = _mm512_set1_pd(hh[nb-3]);
q1 = _mm512_load_pd(&q[(nb+2)*ldq]);
x1 = _mm512_FMA_pd(q1, h1, x1);
h2 = _mm512_set1_pd(hh[ldh+nb-2]);
y1 = _mm512_FMA_pd(q1, h2, y1);
h3 = _mm512_set1_pd(hh[(ldh*2)+nb-1]);
z1 = _mm512_FMA_pd(q1, h3, z1);
h1 = _mm512_set1_pd(hh[nb-2]);
q1 = _mm512_load_pd(&q[(nb+3)*ldq]);
x1 = _mm512_FMA_pd(q1, h1, x1);
h2 = _mm512_set1_pd(hh[ldh+nb-1]);
y1 = _mm512_FMA_pd(q1, h2, y1);
h1 = _mm512_set1_pd(hh[nb-1]);
q1 = _mm512_load_pd(&q[(nb+4)*ldq]);
x1 = _mm512_FMA_pd(q1, h1, x1);
/////////////////////////////////////////////////////
// Apply tau and correct the accumulated sums using the pre-calculated scalar products
/////////////////////////////////////////////////////
__m512d tau1 = _mm512_set1_pd(hh[0]);
x1 = _mm512_mul_pd(x1, tau1);
__m512d tau2 = _mm512_set1_pd(hh[ldh]);
__m512d vs_1_2 = _mm512_set1_pd(scalarprods[0]);
h2 = _mm512_mul_pd(tau2, vs_1_2);
y1 = _mm512_FMSUB_pd(y1, tau2, _mm512_mul_pd(x1,h2));
__m512d tau3 = _mm512_set1_pd(hh[ldh*2]);
__m512d vs_1_3 = _mm512_set1_pd(scalarprods[1]);
__m512d vs_2_3 = _mm512_set1_pd(scalarprods[2]);
h2 = _mm512_mul_pd(tau3, vs_1_3);
h3 = _mm512_mul_pd(tau3, vs_2_3);
z1 = _mm512_FMSUB_pd(z1, tau3, _mm512_FMA_pd(y1, h3, _mm512_mul_pd(x1,h2)));
__m512d tau4 = _mm512_set1_pd(hh[ldh*3]);
__m512d vs_1_4 = _mm512_set1_pd(scalarprods[3]);
__m512d vs_2_4 = _mm512_set1_pd(scalarprods[4]);
h2 = _mm512_mul_pd(tau4, vs_1_4);
h3 = _mm512_mul_pd(tau4, vs_2_4);
__m512d vs_3_4 = _mm512_set1_pd(scalarprods[5]);
h4 = _mm512_mul_pd(tau4, vs_3_4);
w1 = _mm512_FMSUB_pd(w1, tau4, _mm512_FMA_pd(z1, h4, _mm512_FMA_pd(y1, h3, _mm512_mul_pd(x1,h2))));
__m512d tau5 = _mm512_set1_pd(hh[ldh*4]);
__m512d vs_1_5 = _mm512_set1_pd(scalarprods[6]);
__m512d vs_2_5 = _mm512_set1_pd(scalarprods[7]);
h2 = _mm512_mul_pd(tau5, vs_1_5);
h3 = _mm512_mul_pd(tau5, vs_2_5);
__m512d vs_3_5 = _mm512_set1_pd(scalarprods[8]);
__m512d vs_4_5 = _mm512_set1_pd(scalarprods[9]);
h4 = _mm512_mul_pd(tau5, vs_3_5);
h5 = _mm512_mul_pd(tau5, vs_4_5);
v1 = _mm512_FMSUB_pd(v1, tau5, _mm512_add_pd(_mm512_FMA_pd(w1, h5, _mm512_mul_pd(z1,h4)), _mm512_FMA_pd(y1, h3, _mm512_mul_pd(x1,h2))));
__m512d tau6 = _mm512_set1_pd(hh[ldh*5]);
__m512d vs_1_6 = _mm512_set1_pd(scalarprods[10]);
__m512d vs_2_6 = _mm512_set1_pd(scalarprods[11]);
h2 = _mm512_mul_pd(tau6, vs_1_6);
h3 = _mm512_mul_pd(tau6, vs_2_6);
__m512d vs_3_6 = _mm512_set1_pd(scalarprods[12]);
__m512d vs_4_6 = _mm512_set1_pd(scalarprods[13]);
__m512d vs_5_6 = _mm512_set1_pd(scalarprods[14]);
h4 = _mm512_mul_pd(tau6, vs_3_6);
h5 = _mm512_mul_pd(tau6, vs_4_6);
h6 = _mm512_mul_pd(tau6, vs_5_6);
t1 = _mm512_FMSUB_pd(t1, tau6, _mm512_FMA_pd(v1, h6, _mm512_add_pd(_mm512_FMA_pd(w1, h5, _mm512_mul_pd(z1,h4)), _mm512_FMA_pd(y1, h3, _mm512_mul_pd(x1,h2)))));
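/* In compact form, the correction above amounts to (with acc_1..acc_6 =
 * x, y, z, w, v, t and s_jk the pre-calculated scalar products):
 *   x <- tau_1 * x
 *   acc_k <- tau_k * ( acc_k - sum_{j<k} s_jk * acc_j ),   k = 2..6
 * where the acc_j on the right-hand side are the already corrected values. */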
/////////////////////////////////////////////////////
// Rank-1 update of Q [8 x nb+5]
/////////////////////////////////////////////////////
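/* Leading rows 0 .. 5: row r is updated by r+1 accumulators, starting from t
 * for row 0 and adding one more of v, w, z, y, x each row, mirroring the
 * staggered starts of the six vectors. */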
q1 = _mm512_load_pd(&q[0]);
q1 = _mm512_sub_pd(q1, t1);
_mm512_store_pd(&q[0],q1);
h6 = _mm512_set1_pd(hh[(ldh*5)+1]);
q1 = _mm512_load_pd(&q[ldq]);
q1 = _mm512_sub_pd(q1, v1);
q1 = _mm512_NFMA_pd(t1, h6, q1);
_mm512_store_pd(&q[ldq],q1);
h5 = _mm512_set1_pd(hh[(ldh*4)+1]);
q1 = _mm512_load_pd(&q[ldq*2]);
q1 = _mm512_sub_pd(q1, w1);
q1 = _mm512_NFMA_pd(v1, h5, q1);
h6 = _mm512_set1_pd(hh[(ldh*5)+2]);
q1 = _mm512_NFMA_pd(t1, h6, q1);
_mm512_store_pd(&q[ldq*2],q1);
h4 = _mm512_set1_pd(hh[(ldh*3)+1]);
q1 = _mm512_load_pd(&q[ldq*3]);
q1 = _mm512_sub_pd(q1, z1);
q1 = _mm512_NFMA_pd(w1, h4, q1);
h5 = _mm512_set1_pd(hh[(ldh*4)+2]);
q1 = _mm512_NFMA_pd(v1, h5, q1);
h6 = _mm512_set1_pd(hh[(ldh*5)+3]);
q1 = _mm512_NFMA_pd(t1, h6, q1);
_mm512_store_pd(&q[ldq*3],q1);
h3 = _mm512_set1_pd(hh[(ldh*2)+1]);
q1 = _mm512_load_pd(&q[ldq*4]);
q1 = _mm512_sub_pd(q1, y1);
q1 = _mm512_NFMA_pd(z1, h3, q1);
h4 = _mm512_set1_pd(hh[(ldh*3)+2]);
q1 = _mm512_NFMA_pd(w1, h4, q1);
h5 = _mm512_set1_pd(hh[(ldh*4)+3]);
q1 = _mm512_NFMA_pd(v1, h5, q1);
h6 = _mm512_set1_pd(hh[(ldh*5)+4]);
q1 = _mm512_NFMA_pd(t1, h6, q1);
_mm512_store_pd(&q[ldq*4],q1);
h2 = _mm512_set1_pd(hh[(ldh)+1]);
q1 = _mm512_load_pd(&q[ldq*5]);
q1 = _mm512_sub_pd(q1, x1);
q1 = _mm512_NFMA_pd(y1, h2, q1);
h3 = _mm512_set1_pd(hh[(ldh*2)+2]);
q1 = _mm512_NFMA_pd(z1, h3, q1);
h4 = _mm512_set1_pd(hh[(ldh*3)+3]);
q1 = _mm512_NFMA_pd(w1, h4, q1);
h5 = _mm512_set1_pd(hh[(ldh*4)+4]);
q1 = _mm512_NFMA_pd(v1, h5, q1);
h6 = _mm512_set1_pd(hh[(ldh*5)+5]);
q1 = _mm512_NFMA_pd(t1, h6, q1);
_mm512_store_pd(&q[ldq*5],q1);
for (i = 6; i < nb; i++)
{
q1 = _mm512_load_pd(&q[i*ldq]);
h1 = _mm512_set1_pd(hh[i-5]);
q1 = _mm512_NFMA_pd(x1, h1, q1);
h2 = _mm512_set1_pd(hh[ldh+i-4]);
q1 = _mm512_NFMA_pd(y1, h2, q1);
h3 = _mm512_set1_pd(hh[(ldh*2)+i-3]);
q1 = _mm512_NFMA_pd(z1, h3, q1);
h4 = _mm512_set1_pd(hh[(ldh*3)+i-2]);
q1 = _mm512_NFMA_pd(w1, h4, q1);
h5 = _mm512_set1_pd(hh[(ldh*4)+i-1]);
q1 = _mm512_NFMA_pd(v1, h5, q1);
h6 = _mm512_set1_pd(hh[(ldh*5)+i]);
q1 = _mm512_NFMA_pd(t1, h6, q1);
_mm512_store_pd(&q[i*ldq],q1);
}
h1 = _mm512_set1_pd(hh[nb-5]);
q1 = _mm512_load_pd(&q[nb*ldq]);
q1 = _mm512_NFMA_pd(x1, h1, q1);
h2 = _mm512_set1_pd(hh[ldh+nb-4]);
q1 = _mm512_NFMA_pd(y1, h2, q1);
h3 = _mm512_set1_pd(hh[(ldh*2)+nb-3]);
q1 = _mm512_NFMA_pd(z1, h3, q1);
h4 = _mm512_set1_pd(hh[(ldh*3)+nb-2]);
q1 = _mm512_NFMA_pd(w1, h4, q1);