Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
a6280511
Commit
a6280511
authored
Jan 23, 2018
by
Andreas Marek
Browse files
Merge branch 'master_pre_stage' into cleanup_scripts
parents
4143c245
bdea6351
Changes
8
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
.gitlab-ci.yml
View file @
a6280511
This diff is collapsed.
Click to expand it.
ci_test_scripts/generate_gitlab_ci_tests.py
View file @
a6280511
...
...
@@ -614,8 +614,8 @@ for cc, fc, m, o, p, a, b, g, cov, instr, addr, na in product(
memory
=
set_requested_memory
(
matrix_size
[
na
])
# do the configure
print
(
" - export SKIP_STEP=0 "
)
if
(
instr
==
"sse"
or
(
instr
==
"avx"
and
g
!=
"with-gpu"
)):
if
(
instr
==
"sse"
):
print
(
" - if [ $MATRIX_SIZE -gt 150 ]; then export SKIP_STEP=1 ; fi # our SSE test machines do not have a lot of memory"
)
print
(
" - ./ci_test_scripts/run_ci_tests.sh -c
\"
CC=
\\\"
"
+
c_compiler_wrapper
+
"
\\\"
"
+
" CFLAGS=
\\\"
"
+
CFLAGS
+
"
\\\"
"
+
" FC=
\\\"
"
+
fortran_compiler_wrapper
+
"
\\\"
"
+
" FCFLAGS=
\\\"
"
+
FCFLAGS
+
"
\\\"
"
\
...
...
configure.ac
View file @
a6280511
...
...
@@ -851,6 +851,63 @@ if test x"${need_avx512}" = x"yes"; then
AC_MSG_ERROR([Could not compile a test program with AVX512, adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU])
if test x"$can_compile_avx512" = x"yes"; then
AC_MSG_CHECKING([whether we compile for Xeon])
AC_RUN_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1_real;
__m512d x1 = _mm512_xor_pd(h1_real, sign);
return 0;
}
])],
[can_compile_avx512_xeon=yes],
[can_compile_avx512_xeon=no]
)
AC_MSG_RESULT([${can_compile_avx512_xeon}])
AC_MSG_CHECKING([whether we compile for Xeon PHI])
AC_RUN_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1;
__m512d h2_real;
__m512d x1 = (__m512d) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
return 0;
}
])],
[can_compile_avx512_xeon_phi=yes],
[can_compile_avx512_xeon_phi=no]
)
AC_MSG_RESULT([${can_compile_avx512_xeon_phi}])
# this is needed for the intel compiler
if test x"$can_compile_avx512_xeon" = x"yes" ; then
if test x"$can_compile_avx512_xeon_phi" = x"yes" ; then
# we want only one to be true; this is ugly but could not come up with a better way
grep Phi /proc/cpuinfo > /dev/null
if test x"$?" = x"0" ; then
echo "Xeon PHI found ... disabling AVX512 Xeon"
can_compile_avx512_xeon=no
fi
fi
fi
if test x"$can_compile_avx512_xeon" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON],[1],[AVX512 for Xeon is supported on this CPU])
else
if test x"$can_compile_avx512_xeon_phi" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON_PHI],[1],[AVX512 for Xeon-PHI is supported on this CPU])
else
AC_MSG_ERROR([Oho! We can neither compile AVX512 intrinsics for Xeon nor Xeon Phi. This should not happen!])
fi
fi
fi
fi
AC_LANG_POP([C])
...
...
@@ -1157,5 +1214,5 @@ if test x"$enable_kcomputer" = x"yes" ; then
echo "call: make -f ../generated_headers.am generated-headers top_srcdir=.."
echo "BEFORE triggering the build with make!"
else
make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir"
make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir"
CPP="$CPP"
fi
generate_automake_test_programs.py
View file @
a6280511
...
...
@@ -77,6 +77,10 @@ for lang, m, g, q, t, p, d, s, lay in product(sorted(language_flag.keys()),
if
(
s
in
[
"scalapack_all"
,
"scalapack_part"
]
and
(
g
==
1
or
t
!=
"eigenvectors"
or
m
!=
"analytic"
)):
continue
# do not test single-precision scalapack
if
(
s
in
[
"scalapack_all"
,
"scalapack_part"
]
and
(
p
==
"single"
)):
continue
# solve tridiagonal only for real toeplitz matrix in 1stage
if
(
t
==
"solve_tridiagonal"
and
(
s
!=
"1stage"
or
d
!=
"real"
or
m
!=
"toeplitz"
)):
continue
...
...
src/elpa2/elpa2_bandred_template.F90
View file @
a6280511
...
...
@@ -159,9 +159,9 @@
#endif
integer
(
kind
=
ik
)
::
ierr
integer
(
kind
=
ik
)
::
cur_l_rows
,
cur_l_cols
,
vmr_size
,
umc_size
integer
(
kind
=
c_intptr_t
)
::
lc_start
,
lc_end
integer
(
kind
=
c_intptr_t
)
::
lc_start
,
lc_end
#if COMPLEXCASE == 1
integer
(
kind
=
c_intptr_t
)
::
lce_1
,
lcs_1
,
lre_1
integer
(
kind
=
c_intptr_t
)
::
lce_1
,
lcs_1
,
lre_1
#endif
integer
(
kind
=
ik
)
::
lr_end
integer
(
kind
=
ik
)
::
na_cols
...
...
@@ -185,11 +185,27 @@
&
_
&
&
MATH_DATATYPE
logical
::
useGPU_reduction_lower_block_to_tridiagonal
call
obj
%
timer
%
start
(
"bandred_&
&MATH_DATATYPE&
&"
//
&
&
PRECISION_SUFFIX
&
)
useGPU_reduction_lower_block_to_tridiagonal
=
.false.
if
(
useGPU
)
then
useGPU_reduction_lower_block_to_tridiagonal
=
.true.
#if REALCASE == 1
if
(
useQR
)
then
!in this case switch off GPU usage for step "reduce current block to lower triangular form"
! since this is done by QR decomposition
useGPU_reduction_lower_block_to_tridiagonal
=
.false.
endif
#endif
endif
if
(
wantDebug
)
call
obj
%
timer
%
start
(
"mpi_communication"
)
call
mpi_comm_rank
(
mpi_comm_rows
,
my_prow
,
mpierr
)
...
...
@@ -206,18 +222,18 @@
if
(
my_prow
==
0
.and.
my_pcol
==
0
)
then
if
(
wantDebug
)
then
write
(
error_unit
,
*
)
'ELPA2_bandred_&
&MATH_DATATYPE&
&: ERROR: nbw='
,
nbw
,
', nblk='
,
nblk
&MATH_DATATYPE&
&: ERROR: nbw='
,
nbw
,
', nblk='
,
nblk
write
(
error_unit
,
*
)
'ELPA2_bandred_&
&MATH_DATATYPE&
&: ELPA2 works only for nbw==n*nblk'
&MATH_DATATYPE&
&: ELPA2 works only for nbw==n*nblk'
endif
success
=
.false.
return
endif
endif
! na_rows in used nowhere; only na_cols
! na_rows in used nowhere; only na_cols
if
(
useGPU
)
then
#ifdef WITH_MPI
#if COMPLEXCASE == 1
...
...
@@ -268,11 +284,6 @@
#if REALCASE == 1
if
(
useQR
)
then
if
(
useGPU
)
then
print
*
,
"qr decomposition at the moment not supported with GPU"
stop
1
endif
if
(
which_qr_decomposition
==
1
)
then
call
qr_pqrparam_init
(
obj
,
pqrparam
(
1
:
11
),
nblk
,
'M'
,
0
,
nblk
,
'M'
,
0
,
nblk
,
'M'
,
1
,
's'
)
allocate
(
tauvector
(
na
),
stat
=
istat
,
errmsg
=
errorMessage
)
...
...
@@ -533,6 +544,10 @@
! Reduce current block to lower triangular form
#if REALCASE == 1
if
(
useQR
)
then
if
(
useGPU
)
then
! vmrCPU(1:cur_l_rows,1:n_cols) = vmrCUDA(1 : cur_l_rows * n_cols)
endif
if
(
which_qr_decomposition
==
1
)
then
vmrCols
=
2
*
n_cols
#ifdef USE_ASSUMED_SIZE_QR
...
...
@@ -637,7 +652,7 @@
#endif /* WITH_MPI */
if
(
useGPU
)
then
if
(
useGPU
_reduction_lower_block_to_tridiagonal
)
then
vmrCUDA
(
cur_l_rows
*
(
lc
-
1
)
+
1
:
cur_l_rows
*
(
lc
-
1
)
+
lr
)
=
vr
(
1
:
lr
)
else
vmrCPU
(
1
:
lr
,
lc
)
=
vr
(
1
:
lr
)
...
...
@@ -815,7 +830,7 @@
#endif /* WITH_OPENMP */
enddo
! lc
if
(
useGPU
)
then
if
(
useGPU
_reduction_lower_block_to_tridiagonal
)
then
! store column tiles back to GPU
cur_pcol
=
pcol
(
istep
*
nbw
+1
,
nblk
,
np_cols
)
if
(
my_pcol
==
cur_pcol
)
then
...
...
@@ -841,7 +856,7 @@
vav
=
0
call
obj
%
timer
%
start
(
"blas"
)
if
(
useGPU
)
then
if
(
useGPU
_reduction_lower_block_to_tridiagonal
)
then
if
(
l_rows
>
0
)
&
#if REALCASE == 1
call
PRECISION_SYRK
(
'U'
,
'T'
,
&
...
...
@@ -853,7 +868,7 @@
vmrCUDA
,
cur_l_rows
,
&
ZERO
,
vav
,
ubound
(
vav
,
dim
=
1
))
else
! useGPU
else
! useGPU
_reduction_to_tridiagonal
if
(
l_rows
>
0
)
&
#if REALCASE == 1
call
PRECISION_SYRK
(
'U'
,
'T'
,
&
...
...
@@ -892,6 +907,33 @@
#if REALCASE == 1
endif
!useQR
#endif
#if REALCASE == 1
if
(
useGPU
.and.
useQR
)
then
! copy the data for furhter usage
! qr worked on *CPU arrarys
!vmrCUDA(1:cur_l_rows * n_cols) = vmrCPU(1:cur_l_rows,1:n_cols)
cur_pcol
=
pcol
(
istep
*
nbw
+1
,
nblk
,
np_cols
)
if
(
my_pcol
==
cur_pcol
)
then
successCUDA
=
cuda_memcpy2d
((
a_dev
+
&
int
(((
lc_start
-1
)
*
lda
*
size_of_datatype
),
kind
=
c_intptr_t
)),
&
int
(
lda
*
size_of_datatype
,
kind
=
c_intptr_t
),
loc
(
a
(
1
,
lc_start
)),
&
int
(
lda
*
size_of_datatype
,
kind
=
c_intptr_t
),
&
int
(
lr_end
*
size_of_datatype
,
kind
=
c_intptr_t
),
&
int
((
lc_end
-
lc_start
+1
),
kind
=
c_intptr_t
),
&
int
(
cudaMemcpyHostToDevice
,
kind
=
c_int
))
if
(
.not.
(
successCUDA
))
then
print
*
,
"bandred_&
&MATH_DATATYPE&
&: cuda memcpy a_dev failed "
,
istat
stop
1
endif
endif
endif
#endif
! Transpose vmr -> vmc (stored in umc, second half)
if
(
useGPU
)
then
call
elpa_transpose_vectors_
&
...
...
@@ -1517,7 +1559,7 @@
endif
endif
!useGPU
enddo
! istep
enddo
! istep
- loop
if
(
useGPU
)
then
successCUDA
=
cuda_free
(
vav_dev
)
...
...
src/elpa2/kernels/complex_avx512_1hv_template.c
View file @
a6280511
...
...
@@ -63,7 +63,9 @@
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi64
#define _SHUFFLE 0x55
...
...
@@ -87,7 +89,9 @@
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi32
#define _SHUFFLE 0xb1
...
...
@@ -361,6 +365,7 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -368,6 +373,13 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -539,6 +551,7 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -546,6 +559,13 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -688,9 +708,10 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
x4
=
_AVX512_ADD
(
x4
,
_AVX512_FMSUBADD
(
h1_real
,
q4
,
_AVX512_SHUFFLE
(
tmp4
,
tmp4
,
_SHUFFLE
)));
}
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -698,6 +719,13 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -824,6 +852,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -831,6 +860,13 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -934,6 +970,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -941,6 +978,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -1022,9 +1066,10 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
x1
=
_AVX512_ADD
(
x1
,
_AVX512_FMSUBADD
(
h1_real
,
q1
,
_AVX512_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
)));
}
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -1032,6 +1077,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
#ifdef SINGLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_AVX512_XOR_EPI
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
src/elpa2/kernels/complex_avx512_2hv_template.c
View file @
a6280511
...
...
@@ -65,6 +65,9 @@
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MASK_STOREU _mm512_mask_storeu_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _SHUFFLE 0x55
#ifdef HAVE_AVX512
...
...
@@ -90,6 +93,9 @@
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MASK_STOREU _mm512_mask_storeu_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _SHUFFLE 0xb1
#ifdef HAVE_AVX512
...
...
@@ -380,6 +386,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -388,7 +395,14 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
x1
=
_AVX512_FMADDSUB
(
h1_real
,
x1
,
_AVX512_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
));
...
...
@@ -409,6 +423,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -426,6 +441,15 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
h2_real
=
_AVX512_XOR
(
h2_real
,
sign
);
h2_imag
=
_AVX512_XOR
(
h2_imag
,
sign
);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
...
...
@@ -723,6 +747,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -731,7 +756,14 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
x1
=
_AVX512_FMADDSUB
(
h1_real
,
x1
,
_AVX512_SHUFFLE
(
tmp1
,
tmp1
,
_SHUFFLE
));
...
...
@@ -748,6 +780,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -765,6 +798,15 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
h2_real
=
_AVX512_XOR
(
h2_real
,
sign
);
h2_imag
=
_AVX512_XOR
(
h2_imag
,
sign
);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
...
...
@@ -1008,6 +1050,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -1016,7 +1059,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -1031,6 +1080,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -1048,6 +1098,15 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
h2_real
=
_AVX512_XOR
(
h2_real
,
sign
);
h2_imag
=
_AVX512_XOR
(
h2_imag
,
sign
);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2
=
_AVX512_SET
(
s_dbl
[
1
],
s_dbl
[
0
],
...
...
@@ -1238,6 +1297,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h1_real
=
_AVX512_SET1
(
hh_dbl
[
0
]);
h1_imag
=
_AVX512_SET1
(
hh_dbl
[
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -1246,7 +1306,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
#endif
#endif
tmp1
=
_AVX512_MUL
(
h1_imag
,
x1
);
...
...
@@ -1257,6 +1323,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h2_real
=
_AVX512_SET1
(
hh_dbl
[
ldh
*
2
]);
h2_imag
=
_AVX512_SET1
(
hh_dbl
[(
ldh
*
2
)
+
1
]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_real
,
(
__m512i
)
sign
);
h1_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi64
((
__m512i
)
h1_imag
,
(
__m512i
)
sign
);
...
...
@@ -1274,6 +1341,15 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h2_real
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_real
,
(
__m512i
)
sign
);
h2_imag
=
(
__AVX512_DATATYPE
)
_mm512_xor_epi32
((
__m512i
)
h2_imag
,
(
__m512i
)
sign
);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real
=
_AVX512_XOR
(
h1_real
,
sign
);
h1_imag
=
_AVX512_XOR
(
h1_imag
,
sign
);
h2_real
=
_AVX512_XOR
(
h2_real
,
sign
);
h2_imag
=
_AVX512_XOR
(
h2_imag
,
sign
);