Commit 1e1812bc authored Aug 18, 2016 by Andreas Marek

Single precision kernel for AVX512 real block6

parent 5225392a

Changes 5    Pipelines 1

Showing 5 changed files with 3491 additions and 1366 deletions
.gitlab-ci.yml                                                        +13    -0
Makefile.am                                                            +3    -3
src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_single_precision.c     +4    -4
src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c  +3411 -1299
src/mod_compute_hh_trafo_real.F90                                     +60   -60
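The "6hv" (block6) kernels apply six Householder reflectors to the columns of Q in one fused pass; this commit supplies that update in single precision for AVX-512, i.e. 16 floats per 512-bit register. As a point of reference, here is a minimal scalar sketch of the single-reflector building block that such kernels fuse and vectorize; the function and variable names are illustrative and do not come from the ELPA sources:

#include <stddef.h>

/* Illustrative scalar sketch (not ELPA code): apply one Householder
   reflector H = I - tau*v*v^T to one column q of length n.  The AVX-512
   block6 kernels fuse six such reflectors and process 16/32/48/64
   columns of Q per call using __m512 arithmetic. */
void apply_householder(float *q, const float *v, float tau, size_t n)
{
	float s = 0.0f;
	size_t i;
	for (i = 0; i < n; i++)        /* s = v^T * q */
		s += v[i] * q[i];
	s *= tau;
	for (i = 0; i < n; i++)        /* q = q - tau * (v^T q) * v */
		q[i] -= s * v[i];
}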
.gitlab-ci.yml  View file @ 1e1812bc

@@ -2192,6 +2192,19 @@ intel-double-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block6-complex
     - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
     - make check TEST_FLAGS='1000 500 128'
+
+intel-single-precision-mpi-noopenmp-ftimings-redirect-real-avx512_block6-complex-avx512_block1-kernel-jobs:
+  tags:
+    - KNL
+  script:
+    - ./autogen.sh
+    - ./configure FC=mpiifort CC=mpiicc CFLAGS="-O3 -mtune=knl -axMIC-AVX512" FCFLAGS="-O3 -mtune=knl -axMIC-AVX512" SCALAPACK_FCFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKLROOT/include/intel64/lp64" SCALAPACK_LDFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKLROOT/lib/intel64" --with-real-avx512_block6-kernel-only --with-complex-avx512_block1-kernel-only --enable-single-precision
+    - /home/elpa/wait_until_midnight.sh
+    - make -j 8
+    - export OMP_NUM_THREADS=1
+    - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
+    - make check TEST_FLAGS='1000 500 128'
+
 intel-set-kernel-via-environment-variable-mpi-openmp-job:
   tags:
     - cpu
Makefile.am  View file @ 1e1812bc

@@ -204,9 +204,9 @@ endif
 if WITH_REAL_AVX512_BLOCK6_KERNEL
   libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_double_precision.c
-#if WANT_SINGLE_PRECISION_REAL
-#libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
-#endif
+if WANT_SINGLE_PRECISION_REAL
+  libelpa@SUFFIX@_private_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c
+endif
 endif
src/elpa2_kernels/elpa2_kernels_real_avx512_4hv_single_precision.c  View file @ 1e1812bc

@@ -495,7 +495,7 @@ __forceinline void hh_trafo_kernel_64_AVX512_4hv_single(float* q, float* hh, int
 	q3 = _mm512_NFMA_ps(x3, h1, q3);
 	q3 = _mm512_NFMA_ps(y3, h2, q3);
 	q3 = _mm512_NFMA_ps(z3, h3, q3);
-	q3 = _mm512_NFMA_pd(w3, h4, q3);
+	q3 = _mm512_NFMA_ps(w3, h4, q3);
 	_mm512_store_ps(&q[(i*ldq)+32],q3);
 	q4 = _mm512_load_ps(&q[(i*ldq)+48]);

@@ -932,7 +932,7 @@ __forceinline void hh_trafo_kernel_48_AVX512_4hv_single(float* q, float* hh, int
 	q3 = _mm512_NFMA_ps(x3, h1, q3);
 	q3 = _mm512_NFMA_ps(y3, h2, q3);
 	q3 = _mm512_NFMA_ps(z3, h3, q3);
-	q3 = _mm512_NFMA_pd(w3, h4, q3);
+	q3 = _mm512_NFMA_ps(w3, h4, q3);
 	_mm512_store_ps(&q[(i*ldq)+32],q3);
//	q4 = _mm512_load_ps(&q[(i*ldq)+48]);

@@ -1369,7 +1369,7 @@ __forceinline void hh_trafo_kernel_32_AVX512_4hv_single(float* q, float* hh, int
//	q3 = _mm512_NFMA_ps(x3, h1, q3);
//	q3 = _mm512_NFMA_ps(y3, h2, q3);
//	q3 = _mm512_NFMA_ps(z3, h3, q3);
-//	q3 = _mm512_NFMA_pd(w3, h4, q3);
+//	q3 = _mm512_NFMA_ps(w3, h4, q3);
//	_mm512_store_ps(&q[(i*ldq)+32],q3);
//	q4 = _mm512_load_ps(&q[(i*ldq)+48]);

@@ -1806,7 +1806,7 @@ __forceinline void hh_trafo_kernel_16_AVX512_4hv_single(float* q, float* hh, int
//	q3 = _mm512_NFMA_ps(x3, h1, q3);
//	q3 = _mm512_NFMA_ps(y3, h2, q3);
//	q3 = _mm512_NFMA_ps(z3, h3, q3);
-//	q3 = _mm512_NFMA_pd(w3, h4, q3);
+//	q3 = _mm512_NFMA_ps(w3, h4, q3);
//	_mm512_store_ps(&q[(i*ldq)+32],q3);
//	q4 = _mm512_load_ps(&q[(i*ldq)+48]);
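All four hunks above are the same one-character fix: the w3/h4 update in the single-precision 4hv kernel (and its commented-out copies in the narrower kernels) used the double-precision NFMA name. With the wrapper macros these kernels use, _mm512_NFMA_ps expands to _mm512_fnmadd_ps and computes c - a*b on 16 float lanes, whereas the _pd variant expects __m512d operands and does not type-check against the __m512 variables here. A small self-contained illustration (not ELPA code; compile with AVX-512 enabled, e.g. -mavx512f):

#include <immintrin.h>

#define _mm512_NFMA_ps(a,b,c) _mm512_fnmadd_ps(a,b,c)   /* c - a*b per float lane */

/* q3 = q3 - w3*h4, the operation the fixed line performs on one 16-float panel */
__m512 nfma_update(__m512 w3, __m512 h4, __m512 q3)
{
	return _mm512_NFMA_ps(w3, h4, q3);
}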
src/elpa2_kernels/elpa2_kernels_real_avx512_6hv_single_precision.c  View file @ 1e1812bc

@@ -42,47 +42,44 @@
 // any derivatives of ELPA under the same license that we chose for
 // the original distribution, the GNU Lesser General Public License.
 //
-// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
-//
-#include "config-f90.h"
+// Author: Andreas Marek (andreas.marek@mpcdf.mpg.de)
+// --------------------------------------------------------------------------------------------------
+#include "config-f90.h"
 #include <x86intrin.h>

 #define __forceinline __attribute__((always_inline)) static

-#ifdef HAVE_AVX2
+#ifdef HAVE_AVX512

-#ifdef __FMA4__
-#define __ELPA_USE_FMA__
-#define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c)
-#define _mm256_NFMA_ps(a,b,c) _mm256_nmacc_ps(a,b,c)
-#define _mm256_FMSUB_ps(a,b,c) _mm256_msub_ps(a,b,c)
-#endif
+#define _mm512_FMA_ps(a,b,c) _mm512_fmadd_ps(a,b,c)
+#define _mm512_NFMA_ps(a,b,c) _mm512_fnmadd_ps(a,b,c)
+#define _mm512_FMSUB_ps(a,b,c) _mm512_fmsub_ps(a,b,c)

-#ifdef __AVX2__
-#define __ELPA_USE_FMA__
-#define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c)
-#define _mm256_NFMA_ps(a,b,c) _mm256_fnmadd_ps(a,b,c)
-#define _mm256_FMSUB_ps(a,b,c) _mm256_fmsub_ps(a,b,c)
-#endif
 #endif

 //Forward declaration
-static void hh_trafo_kernel_4_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
-static void hh_trafo_kernel_8_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
-
-void hexa_hh_trafo_real_avx512_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
+//static void hh_trafo_kernel_4_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+//static void hh_trafo_kernel_8_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+static void hh_trafo_kernel_16_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+//static void hh_trafo_kernel_24_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+static void hh_trafo_kernel_32_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+static void hh_trafo_kernel_48_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+static void hh_trafo_kernel_64_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
+
+void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);

 /*
-!f>#ifdef HAVE_AVX512
+!f>#if defined(HAVE_AVX512)
 !f> interface
 !f>   subroutine hexa_hh_trafo_real_avx512_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
-!f>                             bind(C, name="hexa_hh_trafo_real_512_6hv_single")
+!f>                             bind(C, name="hexa_hh_trafo_real_avx512_6hv_single")
 !f>     use, intrinsic :: iso_c_binding
 !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
 !f>     type(c_ptr), value      :: q
-!f>     real(kind=c_float)      :: hh(pnb,6)
+!f>     real(kind=c_float)      :: hh(pnb,6)
 !f>   end subroutine
 !f> end interface
 !f>#endif
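The new macro block maps the kernels' generic FMA names onto AVX-512 intrinsics: _mm512_FMA_ps is a*b + c, _mm512_NFMA_ps is c - a*b, and _mm512_FMSUB_ps is a*b - c, each applied to 16 float lanes. A small stand-alone check of those identities (illustrative only, not part of ELPA; build with -mavx512f on an AVX-512 machine):

#include <immintrin.h>
#include <stdio.h>

#define _mm512_FMA_ps(a,b,c)   _mm512_fmadd_ps(a,b,c)
#define _mm512_NFMA_ps(a,b,c)  _mm512_fnmadd_ps(a,b,c)
#define _mm512_FMSUB_ps(a,b,c) _mm512_fmsub_ps(a,b,c)

int main(void)
{
	float out[16] __attribute__((aligned(64)));
	__m512 a = _mm512_set1_ps(2.0f);
	__m512 b = _mm512_set1_ps(3.0f);
	__m512 c = _mm512_set1_ps(10.0f);

	_mm512_store_ps(out, _mm512_FMA_ps(a, b, c));    /* 2*3 + 10 = 16 */
	printf("FMA:   %g\n", out[0]);
	_mm512_store_ps(out, _mm512_NFMA_ps(a, b, c));   /* 10 - 2*3 =  4 */
	printf("NFMA:  %g\n", out[0]);
	_mm512_store_ps(out, _mm512_FMSUB_ps(a, b, c));  /* 2*3 - 10 = -4 */
	printf("FMSUB: %g\n", out[0]);
	return 0;
}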
@@ -195,44 +192,39 @@ void hexa_hh_trafo_real_avx512_6hv_single(float* q, float* hh, int* pnb, int* pn
 		scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
 	}

 	// Production level kernel calls with padding
-#ifdef __AVX__
-	for (i = 0; i < nq-4; i+=8)
+	for (i = 0; i < nq-48; i+=64)
 	{
-		hh_trafo_kernel_8_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
+		hh_trafo_kernel_64_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
 	}
 	if (nq == i)
 	{
 		return;
 	}
-	else
-	{
-		hh_trafo_kernel_4_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
-	}
-#else
-	for (i = 0; i < nq-2; i+=4)
+	if (nq-i == 48)
 	{
-		hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
+		hh_trafo_kernel_48_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
 	}
-	if (nq == i)
+	if (nq-i == 32)
 	{
-		return;
+		hh_trafo_kernel_32_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
 	}
 	else
 	{
-		hh_trafo_kernel_2_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
+		hh_trafo_kernel_16_AVX512_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
 	}
-#endif
 }

 /**
  * Unrolled kernel that computes
- * 8 rows of Q simultaneously, a
+ * 64 rows of Q simultaneously, a
  * matrix vector product with two householder
  * vectors + a rank 1 update is performed
  */
-__forceinline void hh_trafo_kernel_8_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
+__forceinline void hh_trafo_kernel_64_AVX512_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
 {
 	/////////////////////////////////////////////////////
 	// Matrix Vector Multiplication, Q [8 x nb+3] * hh
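The rewritten dispatch walks the nq columns in chunks of 64 and hands the remainder to the 48/32/16-column kernels. A compact sketch of that control flow with stub kernels (illustrative only; it normalizes the remainder checks to an else-if chain so each leftover block is handled exactly once, and it assumes nq is a multiple of 16 as the single-precision AVX-512 path requires):

#include <stdio.h>

/* Stub standing in for the AVX-512 hh_trafo kernels above: pretend to
   process `width` columns of Q starting at column i. */
static void kernel(int width, int i) { printf("kernel_%d at column %d\n", width, i); }

static void dispatch(int nq)
{
	int i;
	for (i = 0; i < nq-48; i += 64)      /* full 64-column chunks */
		kernel(64, i);
	if (nq == i)
		return;
	if (nq-i == 48)      kernel(48, i);  /* remainder: one narrower call */
	else if (nq-i == 32) kernel(32, i);
	else                 kernel(16, i);
}

int main(void)
{
	dispatch(112);   /* 64 + 48 */
	dispatch(96);    /* 64 + 32 */
	dispatch(80);    /* 64 + 16 */
	return 0;
}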
@@ -240,772 +232,868 @@ __forceinline void hh_trafo_kernel_8_AVX512_6hv_single(float* q, float* hh, int
 	/////////////////////////////////////////////////////
 	int i;

-	__m256 a1_1 = _mm256_load_ps(&q[ldq*5]);
-	__m256 a2_1 = _mm256_load_ps(&q[ldq*4]);
-	__m256 a3_1 = _mm256_load_ps(&q[ldq*3]);
-	__m256 a4_1 = _mm256_load_ps(&q[ldq*2]);
-	__m256 a5_1 = _mm256_load_ps(&q[ldq]);
-	__m256 a6_1 = _mm256_load_ps(&q[0]);       // q(1,1) | q(2,1) | q(3,1) | q(4,1)
-
-	__m256 h_6_5 = _mm256_broadcast_ss(&hh[(ldh*5)+1]);
-	__m256 h_6_4 = _mm256_broadcast_ss(&hh[(ldh*5)+2]);
-	__m256 h_6_3 = _mm256_broadcast_ss(&hh[(ldh*5)+3]);
-	__m256 h_6_2 = _mm256_broadcast_ss(&hh[(ldh*5)+4]);
-	__m256 h_6_1 = _mm256_broadcast_ss(&hh[(ldh*5)+5]);
-#ifdef __ELPA_USE_FMA__
-	register __m256 t1 = _mm256_FMA_ps(a5_1, h_6_5, a6_1);
-	t1 = _mm256_FMA_ps(a4_1, h_6_4, t1);
-	t1 = _mm256_FMA_ps(a3_1, h_6_3, t1);
-	t1 = _mm256_FMA_ps(a2_1, h_6_2, t1);
-	t1 = _mm256_FMA_ps(a1_1, h_6_1, t1);
-#else
-	register __m256 t1 = _mm256_add_ps(a6_1, _mm256_mul_ps(a5_1, h_6_5));
-	t1 = _mm256_add_ps(t1, _mm256_mul_ps(a4_1, h_6_4));
-	t1 = _mm256_add_ps(t1, _mm256_mul_ps(a3_1, h_6_3));
-	t1 = _mm256_add_ps(t1, _mm256_mul_ps(a2_1, h_6_2));
-	t1 = _mm256_add_ps(t1, _mm256_mul_ps(a1_1, h_6_1));
-#endif
-	__m256 h_5_4 = _mm256_broadcast_ss(&hh[(ldh*4)+1]);
-	__m256 h_5_3 = _mm256_broadcast_ss(&hh[(ldh*4)+2]);
-	__m256 h_5_2 = _mm256_broadcast_ss(&hh[(ldh*4)+3]);
-	__m256 h_5_1 = _mm256_broadcast_ss(&hh[(ldh*4)+4]);
-#ifdef __ELPA_USE_FMA__
-	register __m256 v1 = _mm256_FMA_ps(a4_1, h_5_4, a5_1);
-	v1 = _mm256_FMA_ps(a3_1, h_5_3, v1);
-	v1 = _mm256_FMA_ps(a2_1, h_5_2, v1);
-	v1 = _mm256_FMA_ps(a1_1, h_5_1, v1);
-#else
-	register __m256 v1 = _mm256_add_ps(a5_1, _mm256_mul_ps(a4_1, h_5_4));
-	v1 = _mm256_add_ps(v1, _mm256_mul_ps(a3_1, h_5_3));
-	v1 = _mm256_add_ps(v1, _mm256_mul_ps(a2_1, h_5_2));
-	v1 = _mm256_add_ps(v1, _mm256_mul_ps(a1_1, h_5_1));
-#endif
-	__m256 h_4_3 = _mm256_broadcast_ss(&hh[(ldh*3)+1]);
-	__m256 h_4_2 = _mm256_broadcast_ss(&hh[(ldh*3)+2]);
-	__m256 h_4_1 = _mm256_broadcast_ss(&hh[(ldh*3)+3]);
-#ifdef __ELPA_USE_FMA__
-	register __m256 w1 = _mm256_FMA_ps(a3_1, h_4_3, a4_1);
-	w1 = _mm256_FMA_ps(a2_1, h_4_2, w1);
-	w1 = _mm256_FMA_ps(a1_1, h_4_1, w1);
-#else
-	register __m256 w1 = _mm256_add_ps(a4_1, _mm256_mul_ps(a3_1, h_4_3));
-	w1 = _mm256_add_ps(w1, _mm256_mul_ps(a2_1, h_4_2));
-	w1 = _mm256_add_ps(w1, _mm256_mul_ps(a1_1, h_4_1));
-#endif
-	__m256 h_2_1 = _mm256_broadcast_ss(&hh[ldh+1]);
-	__m256 h_3_2 = _mm256_broadcast_ss(&hh[(ldh*2)+1]);
-	__m256 h_3_1 = _mm256_broadcast_ss(&hh[(ldh*2)+2]);
-#ifdef __ELPA_USE_FMA__
-	register __m256 z1 = _mm256_FMA_ps(a2_1, h_3_2, a3_1);
-	z1 = _mm256_FMA_ps(a1_1, h_3_1, z1);
-	register __m256 y1 = _mm256_FMA_ps(a1_1, h_2_1, a2_1);
-#else
-	register __m256 z1 = _mm256_add_ps(a3_1, _mm256_mul_ps(a2_1, h_3_2));
-	z1 = _mm256_add_ps(z1, _mm256_mul_ps(a1_1, h_3_1));
-	register __m256 y1 = _mm256_add_ps(a2_1, _mm256_mul_ps(a1_1, h_2_1));
-#endif
-	register __m256 x1 = a1_1;
-
-//	__m256d a1_2 = _mm256_load_pd(&q[(ldq*5)+4]);
-//	__m256d a2_2 = _mm256_load_pd(&q[(ldq*4)+4]);
-//	__m256d a3_2 = _mm256_load_pd(&q[(ldq*3)+4]);
-//	__m256d a4_2 = _mm256_load_pd(&q[(ldq*2)+4]);
-//	__m256d a5_2 = _mm256_load_pd(&q[(ldq)+4]);
-//	__m256d a6_2 = _mm256_load_pd(&q[4]);
-#ifdef __ELPA_USE_FMA__
-//	register __m256d t2 = _mm256_FMA_pd(a5_2, h_6_5, a6_2);
-//	t2 = _mm256_FMA_pd(a4_2, h_6_4, t2);
-//	t2 = _mm256_FMA_pd(a3_2, h_6_3, t2);
-//	t2 = _mm256_FMA_pd(a2_2, h_6_2, t2);
-//	t2 = _mm256_FMA_pd(a1_2, h_6_1, t2);
-//	register __m256d v2 = _mm256_FMA_pd(a4_2, h_5_4, a5_2);
-//	v2 = _mm256_FMA_pd(a3_2, h_5_3, v2);
-//	v2 = _mm256_FMA_pd(a2_2, h_5_2, v2);
-//	v2 = _mm256_FMA_pd(a1_2, h_5_1, v2);
-//	register __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2);
-//	w2 = _mm256_FMA_pd(a2_2, h_4_2, w2);
-//	w2 = _mm256_FMA_pd(a1_2, h_4_1, w2);
-//	register __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2);
-//	z2 = _mm256_FMA_pd(a1_2, h_3_1, z2);
-//	register __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2);
-#else
-//	register __m256d t2 = _mm256_add_pd(a6_2, _mm256_mul_pd(a5_2, h_6_5));
-//	t2 = _mm256_add_pd(t2, _mm256_mul_pd(a4_2, h_6_4));
-//	t2 = _mm256_add_pd(t2, _mm256_mul_pd(a3_2, h_6_3));
-//	t2 = _mm256_add_pd(t2, _mm256_mul_pd(a2_2, h_6_2));
-//	t2 = _mm256_add_pd(t2, _mm256_mul_pd(a1_2, h_6_1));
-//	register __m256d v2 = _mm256_add_pd(a5_2, _mm256_mul_pd(a4_2, h_5_4));
-//	v2 = _mm256_add_pd(v2, _mm256_mul_pd(a3_2, h_5_3));
-//	v2 = _mm256_add_pd(v2, _mm256_mul_pd(a2_2, h_5_2));
-//	v2 = _mm256_add_pd(v2, _mm256_mul_pd(a1_2, h_5_1));
-//	register __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3));
-//	w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2));
-//	w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1));
-//	register __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2));
-//	z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1));
-//	register __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1));
-#endif
-//	register __m256d x2 = a1_2;
-
-	__m256 q1;
-//	__m256d q2;
-
-	__m256 h1;
-	__m256 h2;
-	__m256 h3;
-	__m256 h4;
-	__m256 h5;
-	__m256 h6;
+	__m512 a1_1 = _mm512_load_ps(&q[ldq*5]);
+	__m512 a2_1 = _mm512_load_ps(&q[ldq*4]);
+	__m512 a3_1 = _mm512_load_ps(&q[ldq*3]);
+	__m512 a4_1 = _mm512_load_ps(&q[ldq*2]);
+	__m512 a5_1 = _mm512_load_ps(&q[ldq]);
+	__m512 a6_1 = _mm512_load_ps(&q[0]);
+
+	__m512 h_6_5 = _mm512_set1_ps(hh[(ldh*5)+1]);
+	__m512 h_6_4 = _mm512_set1_ps(hh[(ldh*5)+2]);
+	__m512 h_6_3 = _mm512_set1_ps(hh[(ldh*5)+3]);
+	__m512 h_6_2 = _mm512_set1_ps(hh[(ldh*5)+4]);
+	__m512 h_6_1 = _mm512_set1_ps(hh[(ldh*5)+5]);
+
+//	register __m512d t1 = _mm512_FMA_ps(a5_1, h_6_5, a6_1);
+	__m512 t1 = _mm512_FMA_ps(a5_1, h_6_5, a6_1);
+	t1 = _mm512_FMA_ps(a4_1, h_6_4, t1);
+	t1 = _mm512_FMA_ps(a3_1, h_6_3, t1);
+	t1 = _mm512_FMA_ps(a2_1, h_6_2, t1);
+	t1 = _mm512_FMA_ps(a1_1, h_6_1, t1);
+
+	__m512 h_5_4 = _mm512_set1_ps(hh[(ldh*4)+1]);
+	__m512 h_5_3 = _mm512_set1_ps(hh[(ldh*4)+2]);
+	__m512 h_5_2 = _mm512_set1_ps(hh[(ldh*4)+3]);
+	__m512 h_5_1 = _mm512_set1_ps(hh[(ldh*4)+4]);
+
+//	register __m512d v1 = _mm512_FMA_ps(a4_1, h_5_4, a5_1);
+	__m512 v1 = _mm512_FMA_ps(a4_1, h_5_4, a5_1);
+	v1 = _mm512_FMA_ps(a3_1, h_5_3, v1);
+	v1 = _mm512_FMA_ps(a2_1, h_5_2, v1);
+	v1 = _mm512_FMA_ps(a1_1, h_5_1, v1);
+
+	__m512 h_4_3 = _mm512_set1_ps(hh[(ldh*3)+1]);
+	__m512 h_4_2 = _mm512_set1_ps(hh[(ldh*3)+2]);
+	__m512 h_4_1 = _mm512_set1_ps(hh[(ldh*3)+3]);
+
+//	register __m512d w1 = _mm512_FMA_ps(a3_1, h_4_3, a4_1);
+	__m512 w1 = _mm512_FMA_ps(a3_1, h_4_3, a4_1);
+	w1 = _mm512_FMA_ps(a2_1, h_4_2, w1);
+	w1 = _mm512_FMA_ps(a1_1, h_4_1, w1);
+
+	__m512 h_2_1 = _mm512_set1_ps(hh[ldh+1]);
+	__m512 h_3_2 = _mm512_set1_ps(hh[(ldh*2)+1]);
+	__m512 h_3_1 = _mm512_set1_ps(hh[(ldh*2)+2]);
+
+//	register __m512d z1 = _mm512_FMA_ps(a2_1, h_3_2, a3_1);
+	__m512 z1 = _mm512_FMA_ps(a2_1, h_3_2, a3_1);
+	z1 = _mm512_FMA_ps(a1_1, h_3_1, z1);
+//	register __m512d y1 = _mm512_FMA_ps(a1_1, h_2_1, a2_1);
+	__m512 y1 = _mm512_FMA_ps(a1_1, h_2_1, a2_1);
+//	register __m512d x1 = a1_1;
+	__m512 x1 = a1_1;
+
+	__m512 a1_2 = _mm512_load_ps(&q[(ldq*5)+16]);
+	__m512 a2_2 = _mm512_load_ps(&q[(ldq*4)+16]);
+	__m512 a3_2 = _mm512_load_ps(&q[(ldq*3)+16]);
+	__m512 a4_2 = _mm512_load_ps(&q[(ldq*2)+16]);
+	__m512 a5_2 = _mm512_load_ps(&q[(ldq)+16]);
+	__m512 a6_2 = _mm512_load_ps(&q[0+16]);
+
+//	register __m512d t2 = _mm512_FMA_ps(a5_2, h_6_5, a6_2);
+	__m512 t2 = _mm512_FMA_ps(a5_2, h_6_5, a6_2);
+	t2 = _mm512_FMA_ps(a4_2, h_6_4, t2);
+	t2 = _mm512_FMA_ps(a3_2, h_6_3, t2);
+	t2 = _mm512_FMA_ps(a2_2, h_6_2, t2);
+	t2 = _mm512_FMA_ps(a1_2, h_6_1, t2);
+//	register __m512d v2 = _mm512_FMA_ps(a4_2, h_5_4, a5_2);
+	__m512 v2 = _mm512_FMA_ps(a4_2, h_5_4, a5_2);
+	v2 = _mm512_FMA_ps(a3_2, h_5_3, v2);
+	v2 = _mm512_FMA_ps(a2_2, h_5_2, v2);
+	v2 = _mm512_FMA_ps(a1_2, h_5_1, v2);
+//	register __m512d w2 = _mm512_FMA_ps(a3_2, h_4_3, a4_2);
+	__m512 w2 = _mm512_FMA_ps(a3_2, h_4_3, a4_2);
+	w2 = _mm512_FMA_ps(a2_2, h_4_2, w2);
+	w2 = _mm512_FMA_ps(a1_2, h_4_1, w2);
+//	register __m512d z2 = _mm512_FMA_ps(a2_2, h_3_2, a3_2);
+	__m512 z2 = _mm512_FMA_ps(a2_2, h_3_2, a3_2);
+	z2 = _mm512_FMA_ps(a1_2, h_3_1, z2);
+//	register __m512d y2 = _mm512_FMA_ps(a1_2, h_2_1, a2_2);
+	__m512 y2 = _mm512_FMA_ps(a1_2, h_2_1, a2_2);
+//	register __m512d x2 = a1_2;
+	__m512 x2 = a1_2;
+
+	__m512 a1_3 = _mm512_load_ps(&q[(ldq*5)+32]);
+	__m512 a2_3 = _mm512_load_ps(&q[(ldq*4)+32]);
+	__m512 a3_3 = _mm512_load_ps(&q[(ldq*3)+32]);
+	__m512 a4_3 = _mm512_load_ps(&q[(ldq*2)+32]);
+	__m512 a5_3 = _mm512_load_ps(&q[(ldq)+32]);
+	__m512 a6_3 = _mm512_load_ps(&q[0+32]);
+
+//	register __m512d t3 = _mm512_FMA_ps(a5_3, h_6_5, a6_3);
+	__m512 t3 = _mm512_FMA_ps(a5_3, h_6_5, a6_3);
+	t3 = _mm512_FMA_ps(a4_3, h_6_4, t3);
+	t3 = _mm512_FMA_ps(a3_3, h_6_3, t3);
+	t3 = _mm512_FMA_ps(a2_3, h_6_2, t3);
+	t3 = _mm512_FMA_ps(a1_3, h_6_1, t3);
+//	register __m512d v3 = _mm512_FMA_ps(a4_3, h_5_4, a5_3);
+	__m512 v3 = _mm512_FMA_ps(a4_3, h_5_4, a5_3);
+	v3 = _mm512_FMA_ps(a3_3, h_5_3, v3);
+	v3 = _mm512_FMA_ps(a2_3, h_5_2, v3);
+	v3 = _mm512_FMA_ps(a1_3, h_5_1, v3);
+//	register __m512d w3 = _mm512_FMA_ps(a3_3, h_4_3, a4_3);
+	__m512 w3 = _mm512_FMA_ps(a3_3, h_4_3, a4_3);
+	w3 = _mm512_FMA_ps(a2_3, h_4_2, w3);
+	w3 = _mm512_FMA_ps(a1_3, h_4_1, w3);
+//	register __m512d z3 = _mm512_FMA_ps(a2_3, h_3_2, a3_3);
+	__m512 z3 = _mm512_FMA_ps(a2_3, h_3_2, a3_3);
+	z3 = _mm512_FMA_ps(a1_3, h_3_1, z3);
+//	register __m512d y3 = _mm512_FMA_ps(a1_3, h_2_1, a2_3);
+	__m512 y3 = _mm512_FMA_ps(a1_3, h_2_1, a2_3);
+//	register __m512d x3 = a1_3;
+	__m512 x3 = a1_3;
+
+	__m512 a1_4 = _mm512_load_ps(&q[(ldq*5)+48]);
+	__m512 a2_4 = _mm512_load_ps(&q[(ldq*4)+48]);
+	__m512 a3_4 = _mm512_load_ps(&q[(ldq*3)+48]);
+	__m512 a4_4 = _mm512_load_ps(&q[(ldq*2)+48]);
+	__m512 a5_4 = _mm512_load_ps(&q[(ldq)+48]);
+	__m512 a6_4 = _mm512_load_ps(&q[0+48]);
+
+//	register __m512d t4 = _mm512_FMA_ps(a5_4, h_6_5, a6_4);
+	__m512 t4 = _mm512_FMA_ps(a5_4, h_6_5, a6_4);
+	t4 = _mm512_FMA_ps(a4_4, h_6_4, t4);
+	t4 = _mm512_FMA_ps(a3_4, h_6_3, t4);
+	t4 = _mm512_FMA_ps(a2_4, h_6_2, t4);
+	t4 = _mm512_FMA_ps(a1_4, h_6_1, t4);
+//	register __m512d v4 = _mm512_FMA_ps(a4_4, h_5_4, a5_4);
+	__m512 v4 = _mm512_FMA_ps(a4_4, h_5_4, a5_4);
+	v4 = _mm512_FMA_ps(a3_4, h_5_3, v4);
+	v4 = _mm512_FMA_ps(a2_4, h_5_2, v4);
+	v4 = _mm512_FMA_ps(a1_4, h_5_1, v4);
+//	register __m512d w4 = _mm512_FMA_ps(a3_4, h_4_3, a4_4);
+	__m512 w4 = _mm512_FMA_ps(a3_4, h_4_3, a4_4);
+	w4 = _mm512_FMA_ps(a2_4, h_4_2, w4);
+	w4 = _mm512_FMA_ps(a1_4, h_4_1, w4);
+//	register __m512d z4 = _mm512_FMA_ps(a2_4, h_3_2, a3_4);
+	__m512 z4 = _mm512_FMA_ps(a2_4, h_3_2, a3_4);
+	z4 = _mm512_FMA_ps(a1_4, h_3_1, z4);
+//	register __m512d y4 = _mm512_FMA_ps(a1_4, h_2_1, a2_4);
+	__m512 y4 = _mm512_FMA_ps(a1_4, h_2_1, a2_4);
+//	register __m512d x4 = a1_4;
+	__m512 x4 = a1_4;
+
+	__m512 q1;
+	__m512 q2;
+	__m512 q3;
+	__m512 q4;
+
+	__m512 h1;
+	__m512 h2;
+	__m512 h3;
+	__m512 h4;
+	__m512 h5;
+	__m512 h6;

 	for(i = 6; i < nb; i++)
 	{
-		h1 = _mm256_broadcast_ss(&hh[i-5]);
-		q1 = _mm256_load_ps(&q[i*ldq]);
-//		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
-#ifdef __ELPA_USE_FMA__
-		x1 = _mm256_FMA_ps(q1, h1, x1);
-//		x2 = _mm256_FMA_pd(q2, h1, x2);
-#else
-		x1 = _mm256_add_ps(x1, _mm256_mul_ps(q1,h1));
-//		x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
-#endif
-		h2 = _mm256_broadcast_ss(&hh[ldh+i-4]);
-#ifdef __ELPA_USE_FMA__
-		y1 = _mm256_FMA_ps(q1, h2, y1);
-//		y2 = _mm256_FMA_pd(q2, h2, y2);
-#else
-		y1 = _mm256_add_ps(y1, _mm256_mul_ps(q1,h2));
-//		y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2));
-#endif
-		h3 = _mm256_broadcast_ss(&hh[(ldh*2)+i-3]);
-#ifdef __ELPA_USE_FMA__
-		z1 = _mm256_FMA_ps(q1, h3, z1);
-//		z2 = _mm256_FMA_pd(q2, h3, z2);
-#else
-		z1 = _mm256_add_ps(z1, _mm256_mul_ps(q1,h3));
-//		z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3));
-#endif
-		h4 = _mm256_broadcast_ss(&hh[(ldh*3)+i-2]);
-#ifdef __ELPA_USE_FMA__
-		w1 = _mm256_FMA_ps(q1, h4, w1);
-//		w2 = _mm256_FMA_pd(q2, h4, w2);
-#else
-		w1 = _mm256_add_ps(w1, _mm256_mul_ps(q1,h4));
-//		w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4));
-#endif
-		h5 = _mm256_broadcast_ss(&hh[(ldh*4)+i-1]);
-#ifdef __ELPA_USE_FMA__
-		v1 = _mm256_FMA_ps(q1, h5, v1);
-//		v2 = _mm256_FMA_pd(q2, h5, v2);
-#else
-		v1 = _mm256_add_ps(v1, _mm256_mul_ps(q1,h5));
-//		v2 = _mm256_add_pd(v2, _mm256_mul_pd(q2,h5));
-#endif
-		h6 = _mm256_broadcast_ss(&hh[(ldh*5)+i]);
-#ifdef __ELPA_USE_FMA__
-		t1 = _mm256_FMA_ps(q1, h6, t1);
-//		t2 = _mm256_FMA_pd(q2, h6, t2);
-#else
-		t1 = _mm256_add_ps(t1, _mm256_mul_ps(q1,h6));
-//		t2 = _mm256_add_pd(t2, _mm256_mul_pd(q2,h6));
-#endif
+		h1 = _mm512_set1_ps(hh[i-5]);
+
+		q1 = _mm512_load_ps(&q[i*ldq]);
+		q2 = _mm512_load_ps(&q[(i*ldq)+16]);
+		q3 = _mm512_load_ps(&q[(i*ldq)+32]);
+		q4 = _mm512_load_ps(&q[(i*ldq)+48]);
+
+		x1 = _mm512_FMA_ps(q1, h1, x1);
+		x2 = _mm512_FMA_ps(q2, h1, x2);
+		x3 = _mm512_FMA_ps(q3, h1, x3);
+		x4 = _mm512_FMA_ps(q4, h1, x4);
+
+		h2 = _mm512_set1_ps(hh[ldh+i-4]);
+
+		y1 = _mm512_FMA_ps(q1, h2, y1);
+		y2 = _mm512_FMA_ps(q2, h2, y2);
+		y3 = _mm512_FMA_ps(q3, h2, y3);
+		y4 = _mm512_FMA_ps(q4, h2, y4);
+
+		h3 = _mm512_set1_ps(hh[(ldh*2)+i-3]);
+
+		z1 = _mm512_FMA_ps(q1, h3, z1);
+		z2 = _mm512_FMA_ps(q2, h3, z2);
+		z3 = _mm512_FMA_ps(q3, h3, z3);
+		z4 = _mm512_FMA_ps(q4, h3, z4);
+
+		h4 = _mm512_set1_ps(hh[(ldh*3)+i-2]);
+
+		w1 = _mm512_FMA_ps(q1, h4, w1);
+		w2 = _mm512_FMA_ps(q2, h4, w2);
+		w3 = _mm512_FMA_ps(q3, h4, w3);
+		w4 = _mm512_FMA_ps(q4, h4, w4);
+
+		h5 = _mm512_set1_ps(hh[(ldh*4)+i-1]);
+
+		v1 = _mm512_FMA_ps(q1, h5, v1);
+		v2 = _mm512_FMA_ps(q2, h5, v2);
+		v3 = _mm512_FMA_ps(q3, h5, v3);
+		v4 = _mm512_FMA_ps(q4, h5, v4);
+
+		h6 = _mm512_set1_ps(hh[(ldh*5)+i]);
+
+		t1 = _mm512_FMA_ps(q1, h6, t1);
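The section ends inside the main accumulation loop of the 64-column kernel. The pattern it repeats is uniform: load the four 16-float panels of column i of Q, broadcast one entry of each of the six Householder vectors, and FMA-accumulate into the running sums x/y/z/w/v/t. A scalar analogue of one such iteration for a single panel (illustrative names, not ELPA code):

/* Scalar analogue (not ELPA code) of one iteration of the accumulation
   loop: the contribution of column i of Q to the six running dot products
   x, y, z, w, v, t is one multiply-add per Householder vector.  hh holds
   the six vectors with leading dimension ldh, as in the kernels above. */
void accumulate_column(const float *q_col, const float *hh, int i, int ldh,
                       float *x, float *y, float *z,
                       float *w, float *v, float *t, int nrows)
{
	float h1 = hh[i-5];
	float h2 = hh[ldh   + i-4];
	float h3 = hh[ldh*2 + i-3];
	float h4 = hh[ldh*3 + i-2];
	float h5 = hh[ldh*4 + i-1];
	float h6 = hh[ldh*5 + i];
	for (int r = 0; r < nrows; r++) {
		x[r] += q_col[r] * h1;   /* x = FMA(q, h1, x) */
		y[r] += q_col[r] * h2;
		z[r] += q_col[r] * h3;
		w[r] += q_col[r] * h4;
		v[r] += q_col[r] * h5;
		t[r] += q_col[r] * h6;
	}
}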