Commit e36e3f50 authored by Andreas Marek

First implementation of AVX-512 kernels

For the double-precision real case, the block2 and block4 kernels have
been ported to AVX-512 as a first test.
parent b7e778b4
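As a reading aid (not part of this commit), a minimal sketch of what the port changes at the intrinsic level: the existing AVX2 kernels process 4 doubles per 256-bit _mm256_* register, whereas the AVX-512 kernels process 8 doubles per 512-bit _mm512_* register, so each unrolled block covers twice as many rows of q. The two stand-alone helpers below are illustrative only (they are not ELPA routines), assume 64-byte aligned pointers, and compile with the same -mavx512f -mavx2 -mfma flags used in the CI jobs further down.

#include <immintrin.h>

/* One fused multiply-add step of the Householder update, q += h * x,
   written once with AVX2 (4 doubles per register) and once with
   AVX-512 (8 doubles per register). Pointers must be 64-byte aligned. */
static void fma_step_avx2(double *q, const double *x, double h)
{
    __m256d vh = _mm256_set1_pd(h);
    __m256d vq = _mm256_load_pd(q);                       /* 4 doubles */
    vq = _mm256_fmadd_pd(_mm256_load_pd(x), vh, vq);
    _mm256_store_pd(q, vq);
}

static void fma_step_avx512(double *q, const double *x, double h)
{
    __m512d vh = _mm512_set1_pd(h);
    __m512d vq = _mm512_load_pd(q);                       /* 8 doubles */
    vq = _mm512_fmadd_pd(_mm512_load_pd(x), vh, vq);
    _mm512_store_pd(q, vq);
}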
......@@ -2200,6 +2200,34 @@ gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block2-complex-
- coverage_data
#real avx512 block2, complex avx block1 (emulated)
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx512_block2-complex-avx_block1-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure FC=mpif90 CFLAGS="-O3 -mavx512f -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx512_block2-kernel-only --with-complex-avx_block1-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -knl -- make check TEST_FLAGS='100 25 16'
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx512_block2-complex-avx_block1-kernel-special-gcov-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure FC=mpif90 CFLAGS="--coverage -O3 -mavx512f -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx512_block2-kernel-only --with-complex-avx_block1-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=1
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -knl -- make check TEST_FLAGS='150 50 16'
- ./ci_coverage_collect
artifacts:
paths:
- coverage_data
#real avx2 block2, complex avx2 block1 (emulated)
intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx2_block2-complex-avx2_block1-kernel-jobs:
......@@ -2266,6 +2294,34 @@ gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx2_block4-complex-
paths:
- coverage_data
#real avx512 block4, complex avx block2 (emulated)
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx512_block4-complex-avx_block2-kernel-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure FC=mpif90 CFLAGS="-O3 -mavx512f -mavx2 -mfma" FCFLAGS="-O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx512_block4-kernel-only --with-complex-avx_block2-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -knl -- make check TEST_FLAGS='100 25 16'
gfortran-single-precision-mpi-openmp-ftimings-redirect-real-avx512_block4-complex-avx_block2-kernel-special-gcov-jobs:
tags:
- emulated
script:
- ./autogen.sh
- ./configure FC=mpif90 CFLAGS="--coverage -O3 -mavx512f -mavx2 -mfma" FCFLAGS="--coverage -O3 -march=haswell -mavx2 -mfma" SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --with-ftimings --with-redirect --with-real-avx2_block4-kernel-only --with-complex-avx2_block2-kernel-only --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=1
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/sde-external-7.45.0-2016-05-09-lin/sde -knl -- make check TEST_FLAGS='150 50 16'
- ./ci_coverage_collect
artifacts:
paths:
- coverage_data
#real avx2 block4, complex avx2 block2 (emulated)
......
......@@ -12,9 +12,12 @@
#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12
#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13
#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14
#define ELPA2_REAL_KERNEL_GPU 15
#define ELPA2_REAL_KERNEL_AVX512_BLOCK2 15
#define ELPA2_REAL_KERNEL_AVX512_BLOCK4 16
#define ELPA2_REAL_KERNEL_AVX512_BLOCK6 17
#define ELPA2_REAL_KERNEL_GPU 18
#define ELPA2_NUMBER_OF_REAL_KERNELS 15
#define ELPA2_NUMBER_OF_REAL_KERNELS 18
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
#define ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE 2
......@@ -27,6 +30,9 @@
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11
#define ELPA2_COMPLEX_KERNEL_GPU 12
#define ELPA2_COMPLEX_KERNEL_AVX512_BLOCK1 12
#define ELPA2_COMPLEX_KERNEL_AVX512_BLOCK2 13
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 12
#define ELPA2_COMPLEX_KERNEL_GPU 14
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 14
......@@ -3558,11 +3558,30 @@
stripe_width = (l_nev-1)/stripe_count + 1
#endif
#ifdef DOUBLE_PRECISION_REAL
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK4 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK6) then
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
! (8 * sizeof(double) == 64)
else
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
! (4 * sizeof(double) == 32)
endif
#else
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
! (8 * sizeof(float) == 32)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK4 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK6) then
stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
! (16 * sizeof(float) == 64)
else
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
! (8 * sizeof(float) == 32)
endif
#endif
else ! GPUs are used
stripe_width = 256 ! Must be a multiple of 4
......
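The stripe_width logic above rounds the stripe up to a whole number of vector elements: a 64-byte AVX-512 vector holds 8 doubles or 16 floats, versus 4 doubles or 8 floats for a 32-byte AVX vector. A small stand-alone sketch of the round-up idiom (illustrative, not ELPA code); note that the added constant has to be the divisor minus one, i.e. +7 when rounding to a multiple of 8 and +15 for a multiple of 16:

#include <stdio.h>

/* Round n up to the next multiple of m (m > 0): the idiom used for
   stripe_width above, e.g. ((stripe_width+7)/8)*8 for 8 doubles. */
static int round_up(int n, int m)
{
    return ((n + m - 1) / m) * m;
}

int main(void)
{
    printf("%d\n", round_up(18, 4));   /* 20: doubles, 32-byte AVX     */
    printf("%d\n", round_up(18, 8));   /* 24: doubles, 64-byte AVX-512 */
    printf("%d\n", round_up(18, 16));  /* 32: floats,  64-byte AVX-512 */
    return 0;
}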
......@@ -66,25 +66,17 @@
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_AVX2
#ifdef __FMA4__
#ifdef HAVE_AVX512
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
#define _mm512_FMA_pd(a,b,c) _mm512_fmadd_pd(a,b,c)
#endif
#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
#endif
#endif
// Forward declarations
__forceinline void hh_trafo_kernel_4_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_16_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_24_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_32_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
/*
......@@ -120,9 +112,9 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
}
// Production level kernel calls with padding
for (i = 0; i < nq-20; i+=24)
for (i = 0; i < nq-24; i+=32)
{
hh_trafo_kernel_24_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_32_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
if (nq == i)
......@@ -130,29 +122,178 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
return;
}
if (nq-i == 20)
if (nq-i == 24)
{
hh_trafo_kernel_16_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX512_2hv_double(&q[i+16], hh, nb, ldq, ldh, s);
hh_trafo_kernel_24_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
else if (nq-i == 16)
{
hh_trafo_kernel_16_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
}
else if (nq-i == 12)
else
{
hh_trafo_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
hh_trafo_kernel_4_AVX512_2hv_double(&q[i+8], hh, nb, ldq, ldh, s);
}
else if (nq-i == 8)
}
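Because the diff above interleaves the removed AVX2 remainder cases with the new ones, here is a hedged sketch of the dispatch pattern the new driver follows, with the hh_trafo_kernel_*_AVX512_2hv_double calls reduced to placeholder stubs; the remainder branches are read off the new lines and the forward declarations and may differ in detail from the committed code:

/* Sketch of the block-size dispatch in double_hh_trafo_real_avx512_2hv_double:
   nq is a multiple of the stripe width, so after the 32-row main loop the
   remainder is 0, 8, 16 or 24 rows. */
static void kernel_32(int i) { (void)i; /* 32-row unrolled kernel */ }
static void kernel_24(int i) { (void)i; /* 24-row unrolled kernel */ }
static void kernel_16(int i) { (void)i; /* 16-row unrolled kernel */ }
static void kernel_8 (int i) { (void)i; /*  8-row unrolled kernel */ }

static void dispatch(int nq)
{
    int i;
    for (i = 0; i < nq - 24; i += 32)    /* full 32-row blocks          */
        kernel_32(i);
    if (nq == i)
        return;                          /* no remainder                */
    if (nq - i == 24)
        kernel_24(i);
    else if (nq - i == 16)
        kernel_16(i);
    else                                 /* remaining case: nq - i == 8 */
        kernel_8(i);
}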
/**
* Unrolled kernel that computes
* 32 rows of Q simultaneously: a
* matrix-vector product with two Householder
* vectors plus a rank-2 update is performed.
* (A scalar reference sketch follows this kernel.)
*/
__forceinline void hh_trafo_kernel_32_AVX512_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [32 x nb+1] * hh
// hh contains two Householder vectors, with offset 1
/////////////////////////////////////////////////////
int i;
// Needed bit mask for floating point sign flip
// careful here: the integer sign-bit pattern is reinterpreted as packed doubles
__m512d sign = (__m512d)_mm512_set1_epi64(0x8000000000000000);
__m512d x1 = _mm512_load_pd(&q[ldq]);
__m512d x2 = _mm512_load_pd(&q[ldq+8]);
__m512d x3 = _mm512_load_pd(&q[ldq+16]);
__m512d x4 = _mm512_load_pd(&q[ldq+24]);
__m512d h1 = _mm512_set1_pd(hh[ldh+1]);
__m512d h2;
__m512d q1 = _mm512_load_pd(q);
__m512d y1 = _mm512_FMA_pd(x1, h1, q1);
__m512d q2 = _mm512_load_pd(&q[8]);
__m512d y2 = _mm512_FMA_pd(x2, h1, q2);
__m512d q3 = _mm512_load_pd(&q[16]);
__m512d y3 = _mm512_FMA_pd(x3, h1, q3);
__m512d q4 = _mm512_load_pd(&q[24]);
__m512d y4 = _mm512_FMA_pd(x4, h1, q4);
for(i = 2; i < nb; i++)
{
hh_trafo_kernel_8_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
h1 = _mm512_set1_pd(hh[i-1]);
h2 = _mm512_set1_pd(hh[ldh+i]);
q1 = _mm512_load_pd(&q[i*ldq]);
x1 = _mm512_FMA_pd(q1, h1, x1);
y1 = _mm512_FMA_pd(q1, h2, y1);
q2 = _mm512_load_pd(&q[(i*ldq)+8]);
x2 = _mm512_FMA_pd(q2, h1, x2);
y2 = _mm512_FMA_pd(q2, h2, y2);
q3 = _mm512_load_pd(&q[(i*ldq)+16]);
x3 = _mm512_FMA_pd(q3, h1, x3);
y3 = _mm512_FMA_pd(q3, h2, y3);
q4 = _mm512_load_pd(&q[(i*ldq)+24]);
x4 = _mm512_FMA_pd(q4, h1, x4);
y4 = _mm512_FMA_pd(q4, h2, y4);
}
else
h1 = _mm512_set1_pd(hh[nb-1]);
q1 = _mm512_load_pd(&q[nb*ldq]);
x1 = _mm512_FMA_pd(q1, h1, x1);
q2 = _mm512_load_pd(&q[(nb*ldq)+8]);
x2 = _mm512_FMA_pd(q2, h1, x2);
q3 = _mm512_load_pd(&q[(nb*ldq)+16]);
x3 = _mm512_FMA_pd(q3, h1, x3);
q4 = _mm512_load_pd(&q[(nb*ldq)+24]);
x4 = _mm512_FMA_pd(q4, h1, x4);
/////////////////////////////////////////////////////
// Rank-2 update of Q [32 x nb+1]
/////////////////////////////////////////////////////
__m512d tau1 = _mm512_set1_pd(hh[0]);
__m512d tau2 = _mm512_set1_pd(hh[ldh]);
__m512d vs = _mm512_set1_pd(s);
h1 = (__m512d) _mm512_xor_epi64((__m512i) tau1, (__m512i) sign);
x1 = _mm512_mul_pd(x1, h1);
x2 = _mm512_mul_pd(x2, h1);
x3 = _mm512_mul_pd(x3, h1);
x4 = _mm512_mul_pd(x4, h1);
h1 = (__m512d) _mm512_xor_si512((__m512i) tau2, (__m512i) sign);
h2 = _mm512_mul_pd(h1, vs);
y1 = _mm512_FMA_pd(y1, h1, _mm512_mul_pd(x1,h2));
y2 = _mm512_FMA_pd(y2, h1, _mm512_mul_pd(x2,h2));
y3 = _mm512_FMA_pd(y3, h1, _mm512_mul_pd(x3,h2));
y4 = _mm512_FMA_pd(y4, h1, _mm512_mul_pd(x4,h2));
q1 = _mm512_load_pd(q);
q1 = _mm512_add_pd(q1, y1);
_mm512_store_pd(q,q1);
q2 = _mm512_load_pd(&q[8]);
q2 = _mm512_add_pd(q2, y2);
_mm512_store_pd(&q[8],q2);
q3 = _mm512_load_pd(&q[16]);
q3 = _mm512_add_pd(q3, y3);
_mm512_store_pd(&q[16],q3);
q4 = _mm512_load_pd(&q[24]);
q4 = _mm512_add_pd(q4, y4);
_mm512_store_pd(&q[24],q4);
h2 = _mm512_set1_pd(hh[ldh+1]);
q1 = _mm512_load_pd(&q[ldq]);
q1 = _mm512_add_pd(q1, _mm512_FMA_pd(y1, h2, x1));
_mm512_store_pd(&q[ldq],q1);
q2 = _mm512_load_pd(&q[ldq+8]);
q2 = _mm512_add_pd(q2, _mm512_FMA_pd(y2, h2, x2));
_mm512_store_pd(&q[ldq+8],q2);
q3 = _mm512_load_pd(&q[ldq+16]);
q3 = _mm512_add_pd(q3, _mm512_FMA_pd(y3, h2, x3));
_mm512_store_pd(&q[ldq+16],q3);
q4 = _mm512_load_pd(&q[ldq+24]);
q4 = _mm512_add_pd(q4, _mm512_FMA_pd(y4, h2, x4));
_mm512_store_pd(&q[ldq+24],q4);
for (i = 2; i < nb; i++)
{
hh_trafo_kernel_4_AVX512_2hv_double(&q[i], hh, nb, ldq, ldh, s);
h1 = _mm512_set1_pd(hh[i-1]);
h2 = _mm512_set1_pd(hh[ldh+i]);
q1 = _mm512_load_pd(&q[i*ldq]);
q1 = _mm512_FMA_pd(x1, h1, q1);
q1 = _mm512_FMA_pd(y1, h2, q1);
_mm512_store_pd(&q[i*ldq],q1);
q2 = _mm512_load_pd(&q[(i*ldq)+8]);
q2 = _mm512_FMA_pd(x2, h1, q2);
q2 = _mm512_FMA_pd(y2, h2, q2);
_mm512_store_pd(&q[(i*ldq)+8],q2);
q3 = _mm512_load_pd(&q[(i*ldq)+16]);
q3 = _mm512_FMA_pd(x3, h1, q3);
q3 = _mm512_FMA_pd(y3, h2, q3);
_mm512_store_pd(&q[(i*ldq)+16],q3);
q4 = _mm512_load_pd(&q[(i*ldq)+24]);
q4 = _mm512_FMA_pd(x4, h1, q4);
q4 = _mm512_FMA_pd(y4, h2, q4);
_mm512_store_pd(&q[(i*ldq)+24],q4);
}
h1 = _mm512_set1_pd(hh[nb-1]);
q1 = _mm512_load_pd(&q[nb*ldq]);
q1 = _mm512_FMA_pd(x1, h1, q1);
_mm512_store_pd(&q[nb*ldq],q1);
q2 = _mm512_load_pd(&q[(nb*ldq)+8]);
q2 = _mm512_FMA_pd(x2, h1, q2);
_mm512_store_pd(&q[(nb*ldq)+8],q2);
q3 = _mm512_load_pd(&q[(nb*ldq)+16]);
q3 = _mm512_FMA_pd(x3, h1, q3);
_mm512_store_pd(&q[(nb*ldq)+16],q3);
q4 = _mm512_load_pd(&q[(nb*ldq)+24]);
q4 = _mm512_FMA_pd(x4, h1, q4);
_mm512_store_pd(&q[(nb*ldq)+24],q4);
}
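For reference (not part of the commit), a scalar sketch of what each of the 32 lanes of the kernel above computes: q has leading dimension ldq, hh stores the two Householder vectors ldh apart (hh[0] and hh[ldh] hold tau1 and tau2, the leading vector element is implicitly 1), and s is the precomputed cross term supplied by the caller.

/* Scalar equivalent of hh_trafo_kernel_32_AVX512_2hv_double for a single
   column index j (0 <= j < 32); the vector kernel does this for 32 values
   of j at once, 8 per __m512d register. */
static void hh_trafo_scalar_2hv(double *q, const double *hh,
                                int nb, int ldq, int ldh, double s, int j)
{
    int i;

    /* Accumulate the two dot products x and y (matrix-vector part). */
    double x = q[ldq + j];
    double y = q[j] + q[ldq + j] * hh[ldh + 1];
    for (i = 2; i < nb; i++)
    {
        x += q[i * ldq + j] * hh[i - 1];
        y += q[i * ldq + j] * hh[ldh + i];
    }
    x += q[nb * ldq + j] * hh[nb - 1];

    /* Rank-2 update; the xor with the sign mask in the vector code
       corresponds to the minus signs here. */
    double tau1 = hh[0];
    double tau2 = hh[ldh];
    x = -tau1 * x;
    y = -tau2 * y + (-tau2 * s) * x;

    q[j]       += y;
    q[ldq + j] += x + y * hh[ldh + 1];
    for (i = 2; i < nb; i++)
        q[i * ldq + j] += x * hh[i - 1] + y * hh[ldh + i];
    q[nb * ldq + j] += x * hh[nb - 1];
}

Mapping j to 32 consecutive columns and each scalar operation to 512-bit loads, FMAs and stores recovers the vectorized kernel above.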
/**
* Unrolled kernel that computes
* 24 rows of Q simultaneously, a
......@@ -167,304 +308,122 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
/////////////////////////////////////////////////////
int i;
// Needed bit mask for floating point sign flip
__m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000);
__m256d x1 = _mm256_load_pd(&q[ldq]);
__m256d x2 = _mm256_load_pd(&q[ldq+4]);
__m256d x3 = _mm256_load_pd(&q[ldq+8]);
__m256d x4 = _mm256_load_pd(&q[ldq+12]);
__m256d x5 = _mm256_load_pd(&q[ldq+16]);
__m256d x6 = _mm256_load_pd(&q[ldq+20]);
__m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]);
__m256d h2;
#ifdef __ELPA_USE_FMA__
__m256d q1 = _mm256_load_pd(q);
__m256d y1 = _mm256_FMA_pd(x1, h1, q1);
__m256d q2 = _mm256_load_pd(&q[4]);
__m256d y2 = _mm256_FMA_pd(x2, h1, q2);
__m256d q3 = _mm256_load_pd(&q[8]);
__m256d y3 = _mm256_FMA_pd(x3, h1, q3);
__m256d q4 = _mm256_load_pd(&q[12]);
__m256d y4 = _mm256_FMA_pd(x4, h1, q4);
__m256d q5 = _mm256_load_pd(&q[16]);
__m256d y5 = _mm256_FMA_pd(x5, h1, q5);
__m256d q6 = _mm256_load_pd(&q[20]);
__m256d y6 = _mm256_FMA_pd(x6, h1, q6);
#else
__m256d q1 = _mm256_load_pd(q);
__m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1));
__m256d q2 = _mm256_load_pd(&q[4]);
__m256d y2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1));
__m256d q3 = _mm256_load_pd(&q[8]);
__m256d y3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1));
__m256d q4 = _mm256_load_pd(&q[12]);
__m256d y4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1));
__m256d q5 = _mm256_load_pd(&q[16]);
__m256d y5 = _mm256_add_pd(q5, _mm256_mul_pd(x5, h1));
__m256d q6 = _mm256_load_pd(&q[20]);
__m256d y6 = _mm256_add_pd(q6, _mm256_mul_pd(x6, h1));
#endif
// careful here: the integer sign-bit pattern is reinterpreted as packed doubles
__m512d sign = (__m512d)_mm512_set1_epi64(0x8000000000000000);
__m512d x1 = _mm512_load_pd(&q[ldq]);
__m512d x2 = _mm512_load_pd(&q[ldq+8]);
__m512d x3 = _mm512_load_pd(&q[ldq+16]);
// check this: broadcast of hh[ldh+1]
__m512d h1 = _mm512_set1_pd(hh[ldh+1]);
__m512d h2;
__m512d q1 = _mm512_load_pd(q);
__m512d y1 = _mm512_FMA_pd(x1, h1, q1);
__m512d q2 = _mm512_load_pd(&q[8]);
__m512d y2 = _mm512_FMA_pd(x2, h1, q2);
__m512d q3 = _mm512_load_pd(&q[16]);
__m512d y3 = _mm512_FMA_pd(x3, h1, q3);
for(i = 2; i < nb; i++)
{
h1 = _mm256_broadcast_sd(&hh[i-1]);
h2 = _mm256_broadcast_sd(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
q1 = _mm256_load_pd(&q[i*ldq]);
x1 = _mm256_FMA_pd(q1, h1, x1);
y1 = _mm256_FMA_pd(q1, h2, y1);
q2 = _mm256_load_pd(&q[(i*ldq)+4]);
x2 = _mm256_FMA_pd(q2, h1, x2);
y2 = _mm256_FMA_pd(q2, h2, y2);
q3 = _mm256_load_pd(&q[(i*ldq)+8]);
x3 = _mm256_FMA_pd(q3, h1, x3);
y3 = _mm256_FMA_pd(q3, h2, y3);
q4 = _mm256_load_pd(&q[(i*ldq)+12]);
x4 = _mm256_FMA_pd(q4, h1, x4);
y4 = _mm256_FMA_pd(q4, h2, y4);
q5 = _mm256_load_pd(&q[(i*ldq)+16]);
x5 = _mm256_FMA_pd(q5, h1, x5);
y5 = _mm256_FMA_pd(q5, h2, y5);
q6 = _mm256_load_pd(&q[(i*ldq)+20]);
x6 = _mm256_FMA_pd(q6, h1, x6);
y6 = _mm256_FMA_pd(q6, h2, y6);
#else
q1 = _mm256_load_pd(&q[i*ldq]);
x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2));
q2 = _mm256_load_pd(&q[(i*ldq)+4]);
x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2));
q3 = _mm256_load_pd(&q[(i*ldq)+8]);
x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1));
y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2));
q4 = _mm256_load_pd(&q[(i*ldq)+12]);
x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1));
y4 = _mm256_add_pd(y4, _mm256_mul_pd(q4,h2));
q5 = _mm256_load_pd(&q[(i*ldq)+16]);
x5 = _mm256_add_pd(x5, _mm256_mul_pd(q5,h1));
y5 = _mm256_add_pd(y5, _mm256_mul_pd(q5,h2));
q6 = _mm256_load_pd(&q[(i*ldq)+20]);
x6 = _mm256_add_pd(x6, _mm256_mul_pd(q6,h1));
y6 = _mm256_add_pd(y6, _mm256_mul_pd(q6,h2));
#endif
h1 = _mm512_set1_pd(hh[i-1]);
h2 = _mm512_set1_pd(hh[ldh+i]);
q1 = _mm512_load_pd(&q[i*ldq]);
x1 = _mm512_FMA_pd(q1, h1, x1);
y1 = _mm512_FMA_pd(q1, h2, y1);
q2 = _mm512_load_pd(&q[(i*ldq)+8]);
x2 = _mm512_FMA_pd(q2, h1, x2);
y2 = _mm512_FMA_pd(q2, h2, y2);
q3 = _mm512_load_pd(&q[(i*ldq)+16]);
x3 = _mm512_FMA_pd(q3, h1, x3);
y3 = _mm512_FMA_pd(q3, h2, y3);
}
h1 = _mm256_broadcast_sd(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
q1 = _mm256_load_pd(&q[nb*ldq]);
x1 = _mm256_FMA_pd(q1, h1, x1);
q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
x2 = _mm256_FMA_pd(q2, h1, x2);
q3 = _mm256_load_pd(&q[(nb*ldq)+8]);
x3 = _mm256_FMA_pd(q3, h1, x3);
q4 = _mm256_load_pd(&q[(nb*ldq)+12]);
x4 = _mm256_FMA_pd(q4, h1, x4);
q5 = _mm256_load_pd(&q[(nb*ldq)+16]);
x5 = _mm256_FMA_pd(q5, h1, x5);
q6 = _mm256_load_pd(&q[(nb*ldq)+20]);
x6 = _mm256_FMA_pd(q6, h1, x6);
#else
q1 = _mm256_load_pd(&q[nb*ldq]);
x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
q3 = _mm256_load_pd(&q[(nb*ldq)+8]);
x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1));
q4 = _mm256_load_pd(&q[(nb*ldq)+12]);
x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1));
q5 = _mm256_load_pd(&q[(nb*ldq)+16]);
x5 = _mm256_add_pd(x5, _mm256_mul_pd(q5,h1));
q6 = _mm256_load_pd(&q[(nb*ldq)+20]);
x6 = _mm256_add_pd(x6, _mm256_mul_pd(q6,h1));
#endif
h1 = _mm512_set1_pd(hh[nb-1]);
q1 = _mm512_load_pd(&q[nb*ldq]);
x1 = _mm512_FMA_pd(q1, h1, x1);
q2 = _mm512_load_pd(&q[(nb*ldq)+8]);
x2 = _mm512_FMA_pd(q2, h1, x2);
q3 = _mm512_load_pd(&q[(nb*ldq)+16]);
x3 = _mm512_FMA_pd(q3, h1, x3);
/////////////////////////////////////////////////////
// Rank-2 update of Q [24 x nb+1]
/////////////////////////////////////////////////////
__m256d tau1 = _mm256_broadcast_sd(hh);
__m256d tau2 = _mm256_broadcast_sd(&hh[ldh]);
__m256d vs = _mm256_broadcast_sd(&s);
h1 = _mm256_xor_pd(tau1, sign);
x1 = _mm256_mul_pd(x1, h1);
x2 = _mm256_mul_pd(x2, h1);
x3 = _mm256_mul_pd(x3, h1);
x4 = _mm256_mul_pd(x4, h1);
x5 = _mm256_mul_pd(x5, h1);
x6 = _mm256_mul_pd(x6, h1);
h1 = _mm256_xor_pd(tau2, sign);
h2 = _mm256_mul_pd(h1, vs);
#ifdef __ELPA_USE_FMA__
y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2));
y2 = _mm256_FMA_pd(y2, h1, _mm256_mul_pd(x2,h2));
y3 = _mm256_FMA_pd(y3, h1, _mm256_mul_pd(x3,h2));
y4 = _mm256_FMA_pd(y4, h1, _mm256_mul_pd(x4,h2));
y5 = _mm256_FMA_pd(y5, h1, _mm256_mul_pd(x5,h2));
y6 = _mm256_FMA_pd(y6, h1, _mm256_mul_pd(x6,h2));
#else
y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2));
y2 = _mm256_add_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2));
y3 = _mm256_add_pd(_mm256_mul_pd(y3,h1), _mm256_mul_pd(x3,h2));
y4 = _mm256_add_pd(_mm256_mul_pd(y4,h1), _mm256_mul_pd(x4,h2));
y5 = _mm256_add_pd(_mm256_mul_pd(y5,h1), _mm256_mul_pd(x5,h2));
y6 = _mm256_add_pd(_mm256_mul_pd(y6,h1), _mm256_mul_pd(x6,h2));
#endif
q1 = _mm256_load_pd(q);
q1 = _mm256_add_pd(q1, y1);
_mm256_store_pd(q,q1);
q2 = _mm256_load_pd(&q[4]);
q2 = _mm256_add_pd(q2, y2);
_mm256_store_pd(&q[4],q2);
q3 = _mm256_load_pd(&q[8]);
q3 = _mm256_add_pd(q3, y3);
_mm256_store_pd(&q[8],q3);
q4 = _mm256_load_pd(&q[12]);
q4 = _mm256_add_pd(q4, y4);
_mm256_store_pd(&q[12],q4);
q5 = _mm256_load_pd(&q[16]);
q5 = _mm256_add_pd(q5, y5);
_mm256_store_pd(&q[16],q5);
q6 = _mm256_load_pd(&q[20]);
q6 = _mm256_add_pd(q6, y6);
_mm256_store_pd(&q[20],q6);
h2 = _mm256_broadcast_sd(&hh[ldh+1]);
#ifdef __ELPA_USE_FMA__
q1 = _mm256_load_pd(&q[ldq]);
q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1));
_mm256_store_pd(&q[ldq],q1);
q2 = _mm256_load_pd(&q[ldq+4]);
q2 = _mm256_add_pd(q2, _mm256_FMA_pd(y2, h2, x2));