Commit 1e1812bc authored by Andreas Marek

Single precision kernel for AVX512 real block6

parent 5225392a
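This commit adds the single-precision AVX-512 port of the block6 (six Householder vectors) real kernel. A __m512 register holds 16 floats rather than the 8 doubles of a __m512d, which is why the kernel variants introduced below process 16, 32, 48 or 64 rows of q per call, i.e. one to four ZMM registers per column block. A minimal illustration of that lane arithmetic (not part of the commit):

#include <immintrin.h>

enum {
    floats_per_zmm    = sizeof(__m512)  / sizeof(float),   /* 16 single-precision lanes */
    doubles_per_zmm   = sizeof(__m512d) / sizeof(double),  /*  8 double-precision lanes */
    max_rows_per_call = 4 * (sizeof(__m512) / sizeof(float)) /* 64-row kernel: four ZMM loads per q column */
};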
@@ -2192,6 +2192,19 @@ intel-double-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block6-complex
    - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
    - make check TEST_FLAGS='1000 500 128'
+intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block6-complex-avx512_block1-kernel-jobs:
+  tags:
+    - KNL
+  script:
+    - ./autogen.sh
+    - ./configure FC=mpiifort CC=mpiicc CFLAGS="-O3 -mtune=knl -axMIC-AVX512" FCFLAGS="-O3 -mtune=knl -axMIC-AVX512" SCALAPACK_FCFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" SCALAPACK_LDFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKLROOT/lib/intel64" --with-real-avx512_block6-kernel-only --with-complex-avx512_block1-kernel-only --enable-single-precision
+    - /home/elpa/wait_until_midnight.sh
+    - make -j 8
+    - export OMP_NUM_THREADS=1
+    - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
+    - make check TEST_FLAGS='1000 500 128'
intel-set-kernel-via-environment-variable-mpi-openmp-job:
  tags:
    - cpu
@@ -204,9 +204,9 @@ endif
if WITH_REAL_AVX512_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_double_precision.c
-#if WANT_SINGLE_PRECISION_REAL
-# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
-#endif
+if WANT_SINGLE_PRECISION_REAL
+libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
+endif
endif
@@ -495,7 +495,7 @@ __forceinline void hh_trafo_kernel_64_AVX512_4hv_single(float* q, float* hh, int
q3 = _mm512_NFMA_ps(x3, h1, q3);
q3 = _mm512_NFMA_ps(y3, h2, q3);
q3 = _mm512_NFMA_ps(z3, h3, q3);
-q3 = _mm512_NFMA_pd(w3, h4, q3);
+q3 = _mm512_NFMA_ps(w3, h4, q3);
_mm512_store_ps(&q[(i*ldq)+32],q3);
q4 = _mm512_load_ps(&q[(i*ldq)+48]);
@@ -932,7 +932,7 @@ __forceinline void hh_trafo_kernel_48_AVX512_4hv_single(float* q, float* hh, int
q3 = _mm512_NFMA_ps(x3, h1, q3);
q3 = _mm512_NFMA_ps(y3, h2, q3);
q3 = _mm512_NFMA_ps(z3, h3, q3);
-q3 = _mm512_NFMA_pd(w3, h4, q3);
+q3 = _mm512_NFMA_ps(w3, h4, q3);
_mm512_store_ps(&q[(i*ldq)+32],q3);
// q4 = _mm512_load_ps(&q[(i*ldq)+48]);
@@ -1369,7 +1369,7 @@ __forceinline void hh_trafo_kernel_32_AVX512_4hv_single(float* q, float* hh, int
// q3 = _mm512_NFMA_ps(x3, h1, q3);
// q3 = _mm512_NFMA_ps(y3, h2, q3);
// q3 = _mm512_NFMA_ps(z3, h3, q3);
-// q3 = _mm512_NFMA_pd(w3, h4, q3);
+// q3 = _mm512_NFMA_ps(w3, h4, q3);
// _mm512_store_ps(&q[(i*ldq)+32],q3);
// q4 = _mm512_load_ps(&q[(i*ldq)+48]);
@@ -1806,7 +1806,7 @@ __forceinline void hh_trafo_kernel_16_AVX512_4hv_single(float* q, float* hh, int
// q3 = _mm512_NFMA_ps(x3, h1, q3);
// q3 = _mm512_NFMA_ps(y3, h2, q3);
// q3 = _mm512_NFMA_ps(z3, h3, q3);
-// q3 = _mm512_NFMA_pd(w3, h4, q3);
+// q3 = _mm512_NFMA_ps(w3, h4, q3);
// _mm512_store_ps(&q[(i*ldq)+32],q3);
// q4 = _mm512_load_ps(&q[(i*ldq)+48]);
@@ -42,47 +42,44 @@
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
-// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
-//
+// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
+// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
-#ifdef HAVE_AVX2
-#ifdef __FMA4__
-#define __ELPA_USE_FMA__
-#define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c)
-#define _mm256_NFMA_ps(a,b,c) _mm256_nmacc_ps(a,b,c)
-#define _mm256_FMSUB_ps(a,b,c) _mm256_msub_ps(a,b,c)
-#endif
-#ifdef __AVX2__
-#define __ELPA_USE_FMA__
-#define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c)
-#define _mm256_NFMA_ps(a,b,c) _mm256_fnmadd_ps(a,b,c)
-#define _mm256_FMSUB_ps(a,b,c) _mm256_fmsub_ps(a,b,c)
-#endif
-#endif
+#ifdef HAVE_AVX512
+#define __ELPA_USE_FMA__
+#define _mm512_FMA_ps(a,b,c) _mm512_fmadd_ps(a,b,c)
+#define _mm512_NFMA_ps(a,b,c) _mm512_fnmadd_ps(a,b,c)
+#define _mm512_FMSUB_ps(a,b,c) _mm512_fmsub_ps(a,b,c)
+#endif
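Under HAVE_AVX512 the three helper macros map directly onto the AVX-512 fused multiply-add intrinsics; the separate FMA4 and AVX2 branches of the 256-bit version are dropped. A short self-contained reminder of the per-lane semantics these macros rely on (illustration only, not part of the commit):

#include <immintrin.h>

/* Per-lane meaning of the wrapped intrinsics, each acting on all 16 float lanes:
 *   _mm512_fmadd_ps(a,b,c)  ->  a*b + c     (_mm512_FMA_ps)
 *   _mm512_fnmadd_ps(a,b,c) -> -(a*b) + c   (_mm512_NFMA_ps)
 *   _mm512_fmsub_ps(a,b,c)  ->  a*b - c     (_mm512_FMSUB_ps)
 */
static __m512 fma_accumulate(__m512 q1, __m512 h1, __m512 x1)
{
    return _mm512_fmadd_ps(q1, h1, x1);   /* x1 + q1*h1, the accumulation pattern used by the kernels below */
}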
//Forward declaration
-static void hh_trafo_kernel_4_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
-static void hh_trafo_kernel_8_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
-void hexa_hh_trafo_real_avx512_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
+//static void hh_trafo_kernel_4_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+//static void hh_trafo_kernel_8_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+static void hh_trafo_kernel_16_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+//static void hh_trafo_kernel_24_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+static void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+static void hh_trafo_kernel_48_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+static void hh_trafo_kernel_64_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
/*
-!f>#ifdef HAVE_AVX512
+!f>#if defined(HAVE_AVX512)
!f> interface
!f> subroutine hexa_hh_trafo_real_avx512_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
-!f> bind(C, name="hexa_hh_trafo_real_512_6hv_single")
+!f> bind(C, name="hexa_hh_trafo_real_avx512_6hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
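The !f> lines form the Fortran interface that ELPA's build machinery extracts for this kernel; the hunk also corrects the bind(C) name so that it matches the C symbol actually defined in this file (hexa_hh_trafo_real_avx512_6hv_single rather than the misspelled hexa_hh_trafo_real_512_6hv_single). A hypothetical C-side call, just to make the by-reference size arguments explicit (illustration only, not part of the commit):

/* All size parameters are passed as pointers so the same symbol is directly callable from Fortran. */
void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);

static void call_block6_kernel(float* q, float* hh, int nb, int nq, int ldq, int ldh)
{
    hexa_hh_trafo_real_avx512_6hv_single(q, hh, &nb, &nq, &ldq, &ldh);
}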
@@ -195,44 +192,39 @@ void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pn
scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
}
// Production level kernel calls with padding
-#ifdef __AVX__
-for (i = 0; i < nq-4; i+=8)
-{
-hh_trafo_kernel_8_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
-}
-if (nq == i)
-{
-return;
-}
-else
-{
-hh_trafo_kernel_4_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
-}
-#else
-for (i = 0; i < nq-2; i+=4)
-{
-hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
-}
-if (nq == i)
-{
-return;
-}
-else
-{
-hh_trafo_kernel_2_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
-}
-#endif
+for (i = 0; i < nq-48; i+=64)
+{
+hh_trafo_kernel_64_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
+}
+if (nq == i)
+{
+return;
+}
+if (nq-i == 48)
+{
+hh_trafo_kernel_48_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
+}
+if (nq-i == 32)
+{
+hh_trafo_kernel_32_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
+}
+else
+{
+hh_trafo_kernel_16_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
+}
}
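With 16 floats per __m512 register the driver now walks q in blocks of 64 rows and hands any remainder, which the padding comment implies is a multiple of 16, to the narrower 48-, 32- or 16-row kernels. A simplified sketch of that remainder selection (illustration only, not part of the commit):

/* Which trailing kernel the dispatch above ends up using, assuming nq is padded to a multiple of 16. */
static int trailing_kernel_rows(int nq)
{
    int i;
    for (i = 0; i < nq - 48; i += 64)
        ;                              /* each iteration corresponds to one 64-row kernel call */
    if (nq == i)      return 0;        /* everything handled by the 64-row kernel */
    if (nq - i == 48) return 48;
    if (nq - i == 32) return 32;
    return 16;
}
/* Example: nq = 112 gives one 64-row call followed by the 48-row kernel; nq = 128 needs no trailing call. */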
/**
 * Unrolled kernel that computes
- * 8 rows of Q simultaneously, a
+ * 64 rows of Q simultaneously, a
 * matrix vector product with two householder
 * vectors + a rank 1 update is performed
 */
-__forceinline void hh_trafo_kernel_8_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
+__forceinline void hh_trafo_kernel_64_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [8 x nb+3] * hh
@@ -240,772 +232,868 @@ __forceinline void hh_trafo_kernel_8_AVX512_6hv_single(float* q, float* hh, int
/////////////////////////////////////////////////////
int i;
-__m256 a1_1 = _mm256_load_ps(&q[ldq*5]);
-__m256 a2_1 = _mm256_load_ps(&q[ldq*4]);
-__m256 a3_1 = _mm256_load_ps(&q[ldq*3]);
-__m256 a4_1 = _mm256_load_ps(&q[ldq*2]);
-__m256 a5_1 = _mm256_load_ps(&q[ldq]);
-__m256 a6_1 = _mm256_load_ps(&q[0]); // q(1,1) | q(2,1) | q(3,1) | q(4,1)
-__m256 h_6_5 = _mm256_broadcast_ss(&hh[(ldh*5)+1]);
-__m256 h_6_4 = _mm256_broadcast_ss(&hh[(ldh*5)+2]);
-__m256 h_6_3 = _mm256_broadcast_ss(&hh[(ldh*5)+3]);
-__m256 h_6_2 = _mm256_broadcast_ss(&hh[(ldh*5)+4]);
-__m256 h_6_1 = _mm256_broadcast_ss(&hh[(ldh*5)+5]);
-#ifdef __ELPA_USE_FMA__
-register __m256 t1 = _mm256_FMA_ps(a5_1, h_6_5, a6_1);
-t1 = _mm256_FMA_ps(a4_1, h_6_4, t1);
-t1 = _mm256_FMA_ps(a3_1, h_6_3, t1);
-t1 = _mm256_FMA_ps(a2_1, h_6_2, t1);
-t1 = _mm256_FMA_ps(a1_1, h_6_1, t1);
-#else
-register __m256 t1 = _mm256_add_ps(a6_1, _mm256_mul_ps(a5_1, h_6_5));
-t1 = _mm256_add_ps(t1, _mm256_mul_ps(a4_1, h_6_4));
-t1 = _mm256_add_ps(t1, _mm256_mul_ps(a3_1, h_6_3));
-t1 = _mm256_add_ps(t1, _mm256_mul_ps(a2_1, h_6_2));
-t1 = _mm256_add_ps(t1, _mm256_mul_ps(a1_1, h_6_1));
-#endif
-__m256 h_5_4 = _mm256_broadcast_ss(&hh[(ldh*4)+1]);
-__m256 h_5_3 = _mm256_broadcast_ss(&hh[(ldh*4)+2]);
-__m256 h_5_2 = _mm256_broadcast_ss(&hh[(ldh*4)+3]);
-__m256 h_5_1 = _mm256_broadcast_ss(&hh[(ldh*4)+4]);
-#ifdef __ELPA_USE_FMA__
-register __m256 v1 = _mm256_FMA_ps(a4_1, h_5_4, a5_1);
-v1 = _mm256_FMA_ps(a3_1, h_5_3, v1);
-v1 = _mm256_FMA_ps(a2_1, h_5_2, v1);
-v1 = _mm256_FMA_ps(a1_1, h_5_1, v1);
-#else
-register __m256 v1 = _mm256_add_ps(a5_1, _mm256_mul_ps(a4_1, h_5_4));
-v1 = _mm256_add_ps(v1, _mm256_mul_ps(a3_1, h_5_3));
-v1 = _mm256_add_ps(v1, _mm256_mul_ps(a2_1, h_5_2));
-v1 = _mm256_add_ps(v1, _mm256_mul_ps(a1_1, h_5_1));
-#endif
-__m256 h_4_3 = _mm256_broadcast_ss(&hh[(ldh*3)+1]);
-__m256 h_4_2 = _mm256_broadcast_ss(&hh[(ldh*3)+2]);
-__m256 h_4_1 = _mm256_broadcast_ss(&hh[(ldh*3)+3]);
-#ifdef __ELPA_USE_FMA__
-register __m256 w1 = _mm256_FMA_ps(a3_1, h_4_3, a4_1);
-w1 = _mm256_FMA_ps(a2_1, h_4_2, w1);
-w1 = _mm256_FMA_ps(a1_1, h_4_1, w1);
-#else
-register __m256 w1 = _mm256_add_ps(a4_1, _mm256_mul_ps(a3_1, h_4_3));
-w1 = _mm256_add_ps(w1, _mm256_mul_ps(a2_1, h_4_2));
-w1 = _mm256_add_ps(w1, _mm256_mul_ps(a1_1, h_4_1));
-#endif
-__m256 h_2_1 = _mm256_broadcast_ss(&hh[ldh+1]);
-__m256 h_3_2 = _mm256_broadcast_ss(&hh[(ldh*2)+1]);
-__m256 h_3_1 = _mm256_broadcast_ss(&hh[(ldh*2)+2]);
-#ifdef __ELPA_USE_FMA__
-register __m256 z1 = _mm256_FMA_ps(a2_1, h_3_2, a3_1);
-z1 = _mm256_FMA_ps(a1_1, h_3_1, z1);
-register __m256 y1 = _mm256_FMA_ps(a1_1, h_2_1, a2_1);
-#else
-register __m256 z1 = _mm256_add_ps(a3_1, _mm256_mul_ps(a2_1, h_3_2));
-z1 = _mm256_add_ps(z1, _mm256_mul_ps(a1_1, h_3_1));
-register __m256 y1 = _mm256_add_ps(a2_1, _mm256_mul_ps(a1_1, h_2_1));
-#endif
-register __m256 x1 = a1_1;
-// __m256d a1_2 = _mm256_load_pd(&q[(ldq*5)+4]);
-// __m256d a2_2 = _mm256_load_pd(&q[(ldq*4)+4]);
-// __m256d a3_2 = _mm256_load_pd(&q[(ldq*3)+4]);
-// __m256d a4_2 = _mm256_load_pd(&q[(ldq*2)+4]);
-// __m256d a5_2 = _mm256_load_pd(&q[(ldq)+4]);
-// __m256d a6_2 = _mm256_load_pd(&q[4]);
-#ifdef __ELPA_USE_FMA__
-// register __m256d t2 = _mm256_FMA_pd(a5_2, h_6_5, a6_2);
-// t2 = _mm256_FMA_pd(a4_2, h_6_4, t2);
-// t2 = _mm256_FMA_pd(a3_2, h_6_3, t2);
-// t2 = _mm256_FMA_pd(a2_2, h_6_2, t2);
-// t2 = _mm256_FMA_pd(a1_2, h_6_1, t2);
-// register __m256d v2 = _mm256_FMA_pd(a4_2, h_5_4, a5_2);
-// v2 = _mm256_FMA_pd(a3_2, h_5_3, v2);
-// v2 = _mm256_FMA_pd(a2_2, h_5_2, v2);
-// v2 = _mm256_FMA_pd(a1_2, h_5_1, v2);
-// register __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2);
-// w2 = _mm256_FMA_pd(a2_2, h_4_2, w2);
-// w2 = _mm256_FMA_pd(a1_2, h_4_1, w2);
-// register __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2);
-// z2 = _mm256_FMA_pd(a1_2, h_3_1, z2);
-// register __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2);
-#else
-// register __m256d t2 = _mm256_add_pd(a6_2, _mm256_mul_pd(a5_2, h_6_5));
-// t2 = _mm256_add_pd(t2, _mm256_mul_pd(a4_2, h_6_4));
-// t2 = _mm256_add_pd(t2, _mm256_mul_pd(a3_2, h_6_3));
-// t2 = _mm256_add_pd(t2, _mm256_mul_pd(a2_2, h_6_2));
-// t2 = _mm256_add_pd(t2, _mm256_mul_pd(a1_2, h_6_1));
-// register __m256d v2 = _mm256_add_pd(a5_2, _mm256_mul_pd(a4_2, h_5_4));
-// v2 = _mm256_add_pd(v2, _mm256_mul_pd(a3_2, h_5_3));
-// v2 = _mm256_add_pd(v2, _mm256_mul_pd(a2_2, h_5_2));
-// v2 = _mm256_add_pd(v2, _mm256_mul_pd(a1_2, h_5_1));
-// register __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3));
-// w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2));
-// w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1));
-// register __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2));
-// z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1));
-// register __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1));
-#endif
-// register __m256d x2 = a1_2;
-__m256 q1;
-// __m256d q2;
-__m256 h1;
-__m256 h2;
-__m256 h3;
-__m256 h4;
-__m256 h5;
-__m256 h6;
+__m512 a1_1 = _mm512_load_ps(&q[ldq*5]);
+__m512 a2_1 = _mm512_load_ps(&q[ldq*4]);
+__m512 a3_1 = _mm512_load_ps(&q[ldq*3]);
+__m512 a4_1 = _mm512_load_ps(&q[ldq*2]);
+__m512 a5_1 = _mm512_load_ps(&q[ldq]);
+__m512 a6_1 = _mm512_load_ps(&q[0]);
+__m512 h_6_5 = _mm512_set1_ps(hh[(ldh*5)+1]);
+__m512 h_6_4 = _mm512_set1_ps(hh[(ldh*5)+2]);
+__m512 h_6_3 = _mm512_set1_ps(hh[(ldh*5)+3]);
+__m512 h_6_2 = _mm512_set1_ps(hh[(ldh*5)+4]);
+__m512 h_6_1 = _mm512_set1_ps(hh[(ldh*5)+5]);
+// register __m512d t1 = _mm512_FMA_ps(a5_1, h_6_5, a6_1);
+__m512 t1 = _mm512_FMA_ps(a5_1, h_6_5, a6_1);
+t1 = _mm512_FMA_ps(a4_1, h_6_4, t1);
+t1 = _mm512_FMA_ps(a3_1, h_6_3, t1);
+t1 = _mm512_FMA_ps(a2_1, h_6_2, t1);
+t1 = _mm512_FMA_ps(a1_1, h_6_1, t1);
+__m512 h_5_4 = _mm512_set1_ps(hh[(ldh*4)+1]);
+__m512 h_5_3 = _mm512_set1_ps(hh[(ldh*4)+2]);
+__m512 h_5_2 = _mm512_set1_ps(hh[(ldh*4)+3]);
+__m512 h_5_1 = _mm512_set1_ps(hh[(ldh*4)+4]);
+// register __m512d v1 = _mm512_FMA_ps(a4_1, h_5_4, a5_1);
+__m512 v1 = _mm512_FMA_ps(a4_1, h_5_4, a5_1);
+v1 = _mm512_FMA_ps(a3_1, h_5_3, v1);
+v1 = _mm512_FMA_ps(a2_1, h_5_2, v1);
+v1 = _mm512_FMA_ps(a1_1, h_5_1, v1);
+__m512 h_4_3 = _mm512_set1_ps(hh[(ldh*3)+1]);
+__m512 h_4_2 = _mm512_set1_ps(hh[(ldh*3)+2]);
+__m512 h_4_1 = _mm512_set1_ps(hh[(ldh*3)+3]);
+// register __m512d w1 = _mm512_FMA_ps(a3_1, h_4_3, a4_1);
+__m512 w1 = _mm512_FMA_ps(a3_1, h_4_3, a4_1);
+w1 = _mm512_FMA_ps(a2_1, h_4_2, w1);
+w1 = _mm512_FMA_ps(a1_1, h_4_1, w1);
+__m512 h_2_1 = _mm512_set1_ps(hh[ldh+1]);
+__m512 h_3_2 = _mm512_set1_ps(hh[(ldh*2)+1]);
+__m512 h_3_1 = _mm512_set1_ps(hh[(ldh*2)+2]);
+// register __m512d z1 = _mm512_FMA_ps(a2_1, h_3_2, a3_1);
+__m512 z1 = _mm512_FMA_ps(a2_1, h_3_2, a3_1);
+z1 = _mm512_FMA_ps(a1_1, h_3_1, z1);
+// register __m512d y1 = _mm512_FMA_ps(a1_1, h_2_1, a2_1);
+__m512 y1 = _mm512_FMA_ps(a1_1, h_2_1, a2_1);
+// register __m512d x1 = a1_1;
+__m512 x1 = a1_1;
+__m512 a1_2 = _mm512_load_ps(&q[(ldq*5)+16]);
+__m512 a2_2 = _mm512_load_ps(&q[(ldq*4)+16]);
+__m512 a3_2 = _mm512_load_ps(&q[(ldq*3)+16]);
+__m512 a4_2 = _mm512_load_ps(&q[(ldq*2)+16]);
+__m512 a5_2 = _mm512_load_ps(&q[(ldq)+16]);
+__m512 a6_2 = _mm512_load_ps(&q[0+16]);
+// register __m512d t2 = _mm512_FMA_ps(a5_2, h_6_5, a6_2);
+__m512 t2 = _mm512_FMA_ps(a5_2, h_6_5, a6_2);
+t2 = _mm512_FMA_ps(a4_2, h_6_4, t2);
+t2 = _mm512_FMA_ps(a3_2, h_6_3, t2);
+t2 = _mm512_FMA_ps(a2_2, h_6_2, t2);
+t2 = _mm512_FMA_ps(a1_2, h_6_1, t2);
+// register __m512d v2 = _mm512_FMA_ps(a4_2, h_5_4, a5_2);
+__m512 v2 = _mm512_FMA_ps(a4_2, h_5_4, a5_2);
+v2 = _mm512_FMA_ps(a3_2, h_5_3, v2);
+v2 = _mm512_FMA_ps(a2_2, h_5_2, v2);
+v2 = _mm512_FMA_ps(a1_2, h_5_1, v2);
+// register __m512d w2 = _mm512_FMA_ps(a3_2, h_4_3, a4_2);
+__m512 w2 = _mm512_FMA_ps(a3_2, h_4_3, a4_2);
+w2 = _mm512_FMA_ps(a2_2, h_4_2, w2);
+w2 = _mm512_FMA_ps(a1_2, h_4_1, w2);
+// register __m512d z2 = _mm512_FMA_ps(a2_2, h_3_2, a3_2);
+__m512 z2 = _mm512_FMA_ps(a2_2, h_3_2, a3_2);
+z2 = _mm512_FMA_ps(a1_2, h_3_1, z2);
+// register __m512d y2 = _mm512_FMA_ps(a1_2, h_2_1, a2_2);
+__m512 y2 = _mm512_FMA_ps(a1_2, h_2_1, a2_2);
+// register __m512d x2 = a1_2;
+__m512 x2 = a1_2;
+__m512 a1_3 = _mm512_load_ps(&q[(ldq*5)+32]);
+__m512 a2_3 = _mm512_load_ps(&q[(ldq*4)+32]);
+__m512 a3_3 = _mm512_load_ps(&q[(ldq*3)+32]);
+__m512 a4_3 = _mm512_load_ps(&q[(ldq*2)+32]);
+__m512 a5_3 = _mm512_load_ps(&q[(ldq)+32]);
+__m512 a6_3 = _mm512_load_ps(&q[0+32]);
+// register __m512d t3 = _mm512_FMA_ps(a5_3, h_6_5, a6_3);
+__m512 t3 = _mm512_FMA_ps(a5_3, h_6_5, a6_3);
+t3 = _mm512_FMA_ps(a4_3, h_6_4, t3);
+t3 = _mm512_FMA_ps(a3_3, h_6_3, t3);
+t3 = _mm512_FMA_ps(a2_3, h_6_2, t3);
+t3 = _mm512_FMA_ps(a1_3, h_6_1, t3);
+// register __m512d v3 = _mm512_FMA_ps(a4_3, h_5_4, a5_3);
+__m512 v3 = _mm512_FMA_ps(a4_3, h_5_4, a5_3);
+v3 = _mm512_FMA_ps(a3_3, h_5_3, v3);
+v3 = _mm512_FMA_ps(a2_3, h_5_2, v3);
+v3 = _mm512_FMA_ps(a1_3, h_5_1, v3);
+// register __m512d w3 = _mm512_FMA_ps(a3_3, h_4_3, a4_3);
+__m512 w3 = _mm512_FMA_ps(a3_3, h_4_3, a4_3);
+w3 = _mm512_FMA_ps(a2_3, h_4_2, w3);
+w3 = _mm512_FMA_ps(a1_3, h_4_1, w3);
+// register __m512d z3 = _mm512_FMA_ps(a2_3, h_3_2, a3_3);
+__m512 z3 = _mm512_FMA_ps(a2_3, h_3_2, a3_3);
+z3 = _mm512_FMA_ps(a1_3, h_3_1, z3);
+// register __m512d y3 = _mm512_FMA_ps(a1_3, h_2_1, a2_3);
+__m512 y3 = _mm512_FMA_ps(a1_3, h_2_1, a2_3);
+// register __m512d x3 = a1_3;
+__m512 x3 = a1_3;
+__m512 a1_4 = _mm512_load_ps(&q[(ldq*5)+48]);
+__m512 a2_4 = _mm512_load_ps(&q[(ldq*4)+48]);
+__m512 a3_4 = _mm512_load_ps(&q[(ldq*3)+48]);
+__m512 a4_4 = _mm512_load_ps(&q[(ldq*2)+48]);
+__m512 a5_4 = _mm512_load_ps(&q[(ldq)+48]);
+__m512 a6_4 = _mm512_load_ps(&q[0+48]);
+// register __m512d t4 = _mm512_FMA_ps(a5_4, h_6_5, a6_4);
+__m512 t4 = _mm512_FMA_ps(a5_4, h_6_5, a6_4);
+t4 = _mm512_FMA_ps(a4_4, h_6_4, t4);
+t4 = _mm512_FMA_ps(a3_4, h_6_3, t4);
+t4 = _mm512_FMA_ps(a2_4, h_6_2, t4);
+t4 = _mm512_FMA_ps(a1_4, h_6_1, t4);
+// register __m512d v4 = _mm512_FMA_ps(a4_4, h_5_4, a5_4);
+__m512 v4 = _mm512_FMA_ps(a4_4, h_5_4, a5_4);
+v4 = _mm512_FMA_ps(a3_4, h_5_3, v4);
+v4 = _mm512_FMA_ps(a2_4, h_5_2, v4);
+v4 = _mm512_FMA_ps(a1_4, h_5_1, v4);
+// register __m512d w4 = _mm512_FMA_ps(a3_4, h_4_3, a4_4);
+__m512 w4 = _mm512_FMA_ps(a3_4, h_4_3, a4_4);
+w4 = _mm512_FMA_ps(a2_4, h_4_2, w4);
+w4 = _mm512_FMA_ps(a1_4, h_4_1, w4);
+// register __m512d z4 = _mm512_FMA_ps(a2_4, h_3_2, a3_4);
+__m512 z4 = _mm512_FMA_ps(a2_4, h_3_2, a3_4);
+z4 = _mm512_FMA_ps(a1_4, h_3_1, z4);
+// register __m512d y4 = _mm512_FMA_ps(a1_4, h_2_1, a2_4);
+__m512 y4 = _mm512_FMA_ps(a1_4, h_2_1, a2_4);
+// register __m512d x4 = a1_4;
+__m512 x4 = a1_4;
+__m512 q1;
+__m512 q2;
+__m512 q3;
+__m512 q4;
+__m512 h1;
+__m512 h2;
+__m512 h3;
+__m512 h4;
+__m512 h5;
+__m512 h6;
for(i = 6; i < nb; i++)
{
-h1 = _mm256_broadcast_ss(&hh[i-5]);
-q1 = _mm256_load_ps(&q[i*ldq]);
-// q2 = _mm256_load_pd(&q[(i*ldq)+4]);
-#ifdef __ELPA_USE_FMA__
-x1 = _mm256_FMA_ps(q1, h1, x1);
-// x2 = _mm256_FMA_pd(q2, h1, x2);
-#else
-x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1));
-// x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
-#endif
-h2 = _mm256_broadcast_ss(&hh[ldh+i-4]);
-#ifdef __ELPA_USE_FMA__
-y1 = _mm256_FMA_ps(q1, h2, y1);
+h1 = _mm512_set1_ps(hh[i-5]);
+q1 = _mm512_load_ps(&q[i*ldq]);
+q2 = _mm512_load_ps(&q[(i*ldq)+16]);
+q3 = _mm512_load_ps(&q[(i*ldq)+32]);
+q4 = _mm512_load_ps(&q[(i*ldq)+48]);
+x1 = _mm512_FMA_ps(q1, h1, x1);
+x2 = _mm512_FMA_ps(q2, h1, x2);
+x3 = _mm512_FMA_ps(q3, h1, x3);
+x4 = _mm512_FMA_ps(q4, h1, x4);
+h2 = _mm512_set1_ps(hh[ldh+i-4]);
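The loop entered here runs over the remaining Householder components (i = 6 ... nb-1); each iteration loads one 64-row block of q into four __m512 registers and accumulates its contribution to the six running dot products x, y, z, w, v and t. A scalar sketch of what one iteration adds for a single row j (illustration only; the offsets for z, w, v and t follow the same pattern already visible in the scalar-product setup above):

static void accumulate_one_row(float qv /* q[(i*ldq)+j] */, const float* hh, int i, int ldh,
                               float* x, float* y)
{
    *x += qv * hh[i-5];        /* component of the first Householder vector  */
    *y += qv * hh[ldh+i-4];    /* component of the second Householder vector */
    /* z, w, v, t accumulate analogously with hh[2*ldh+i-3] ... hh[5*ldh+i]. */
}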