Commit a6280511 authored by Andreas Marek's avatar Andreas Marek

Merge branch 'master_pre_stage' into cleanup_scripts

parents 4143c245 bdea6351
This diff is collapsed.
......@@ -614,8 +614,8 @@ for cc, fc, m, o, p, a, b, g, cov, instr, addr, na in product(
memory = set_requested_memory(matrix_size[na])
# do the configure
print(" - export SKIP_STEP=0 ")
if ( instr == "sse" or (instr == "avx" and g != "with-gpu")):
if ( instr == "sse"):
print(" - if [ $MATRIX_SIZE -gt 150 ]; then export SKIP_STEP=1 ; fi # our SSE test machines do not have a lot of memory")
print(" - ./ci_test_scripts/run_ci_tests.sh -c \" CC=\\\""+c_compiler_wrapper+"\\\"" + " CFLAGS=\\\""+CFLAGS+"\\\"" + " FC=\\\""+fortran_compiler_wrapper+"\\\"" + " FCFLAGS=\\\""+FCFLAGS+"\\\"" \
......
......@@ -851,6 +851,63 @@ if test x"${need_avx512}" = x"yes"; then
AC_MSG_ERROR([Could not compile a test program with AVX512, adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU])
if test x"$can_compile_avx512" = x"yes"; then
AC_MSG_CHECKING([whether we compile for Xeon])
AC_RUN_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1_real;
__m512d x1 = _mm512_xor_pd(h1_real, sign);
return 0;
}
])],
[can_compile_avx512_xeon=yes],
[can_compile_avx512_xeon=no]
)
AC_MSG_RESULT([${can_compile_avx512_xeon}])
AC_MSG_CHECKING([whether we compile for Xeon PHI])
AC_RUN_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1;
__m512d h2_real;
__m512d x1 = (__m512d) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
return 0;
}
])],
[can_compile_avx512_xeon_phi=yes],
[can_compile_avx512_xeon_phi=no]
)
AC_MSG_RESULT([${can_compile_avx512_xeon_phi}])
# this is needed for the intel compiler
if test x"$can_compile_avx512_xeon" = x"yes" ; then
if test x"$can_compile_avx512_xeon_phi" = x"yes" ; then
# we want only one to be true; this is ugly but could not come up with a better way
grep Phi /proc/cpuinfo > /dev/null
if test x"$?" = x"0" ; then
echo "Xeon PHI found ... disabling AVX512 Xeon"
can_compile_avx512_xeon=no
fi
fi
fi
if test x"$can_compile_avx512_xeon" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON],[1],[AVX512 for Xeon is supported on this CPU])
else
if test x"$can_compile_avx512_xeon_phi" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON_PHI],[1],[AVX512 for Xeon-PHI is supported on this CPU])
else
AC_MSG_ERROR([Oho! We can neither compile AVX512 intrinsics for Xeon nor Xeon Phi. This should not happen!])
fi
fi
fi
fi
AC_LANG_POP([C])
......@@ -1157,5 +1214,5 @@ if test x"$enable_kcomputer" = x"yes" ; then
echo "call: make -f ../generated_headers.am generated-headers top_srcdir=.."
echo "BEFORE triggering the build with make!"
else
make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir"
make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" CPP="$CPP"
fi
......@@ -77,6 +77,10 @@ for lang, m, g, q, t, p, d, s, lay in product(sorted(language_flag.keys()),
if(s in ["scalapack_all", "scalapack_part"] and (g == 1 or t != "eigenvectors" or m != "analytic")):
continue
# do not test single-precision scalapack
if(s in ["scalapack_all", "scalapack_part"] and ( p == "single")):
continue
# solve tridiagonal only for real toeplitz matrix in 1stage
if (t == "solve_tridiagonal" and (s != "1stage" or d != "real" or m != "toeplitz")):
continue
......
......@@ -159,9 +159,9 @@
#endif
integer(kind=ik) :: ierr
integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size
integer(kind=c_intptr_t) :: lc_start, lc_end
integer(kind=c_intptr_t) :: lc_start, lc_end
#if COMPLEXCASE == 1
integer(kind=c_intptr_t) :: lce_1, lcs_1, lre_1
integer(kind=c_intptr_t) :: lce_1, lcs_1, lre_1
#endif
integer(kind=ik) :: lr_end
integer(kind=ik) :: na_cols
......@@ -185,11 +185,27 @@
&_&
&MATH_DATATYPE
logical :: useGPU_reduction_lower_block_to_tridiagonal
call obj%timer%start("bandred_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
useGPU_reduction_lower_block_to_tridiagonal = .false.
if (useGPU) then
useGPU_reduction_lower_block_to_tridiagonal = .true.
#if REALCASE == 1
if (useQR) then
!in this case switch off GPU usage for step "reduce current block to lower triangular form"
! since this is done by QR decomposition
useGPU_reduction_lower_block_to_tridiagonal = .false.
endif
#endif
endif
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
......@@ -206,18 +222,18 @@
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE&
&: ERROR: nbw=',nbw,', nblk=',nblk
&MATH_DATATYPE&
&: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE&
&: ELPA2 works only for nbw==n*nblk'
&MATH_DATATYPE&
&: ELPA2 works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
! na_rows in used nowhere; only na_cols
! na_rows in used nowhere; only na_cols
if (useGPU) then
#ifdef WITH_MPI
#if COMPLEXCASE == 1
......@@ -268,11 +284,6 @@
#if REALCASE == 1
if (useQR) then
if (useGPU) then
print *,"qr decomposition at the moment not supported with GPU"
stop 1
endif
if (which_qr_decomposition == 1) then
call qr_pqrparam_init(obj,pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s')
allocate(tauvector(na), stat=istat, errmsg=errorMessage)
......@@ -533,6 +544,10 @@
! Reduce current block to lower triangular form
#if REALCASE == 1
if (useQR) then
if (useGPU) then
! vmrCPU(1:cur_l_rows,1:n_cols) = vmrCUDA(1 : cur_l_rows * n_cols)
endif
if (which_qr_decomposition == 1) then
vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
......@@ -637,7 +652,7 @@
#endif /* WITH_MPI */
if (useGPU) then
if (useGPU_reduction_lower_block_to_tridiagonal) then
vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
else
vmrCPU(1:lr,lc) = vr(1:lr)
......@@ -815,7 +830,7 @@
#endif /* WITH_OPENMP */
enddo ! lc
if (useGPU) then
if (useGPU_reduction_lower_block_to_tridiagonal) then
! store column tiles back to GPU
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
......@@ -841,7 +856,7 @@
vav = 0
call obj%timer%start("blas")
if (useGPU) then
if (useGPU_reduction_lower_block_to_tridiagonal) then
if (l_rows>0) &
#if REALCASE == 1
call PRECISION_SYRK('U', 'T', &
......@@ -853,7 +868,7 @@
vmrCUDA, cur_l_rows, &
ZERO, vav, ubound(vav,dim=1))
else ! useGPU
else ! useGPU_reduction_to_tridiagonal
if (l_rows>0) &
#if REALCASE == 1
call PRECISION_SYRK('U', 'T', &
......@@ -892,6 +907,33 @@
#if REALCASE == 1
endif !useQR
#endif
#if REALCASE == 1
if (useGPU .and. useQR) then
! copy the data for furhter usage
! qr worked on *CPU arrarys
!vmrCUDA(1:cur_l_rows * n_cols) = vmrCPU(1:cur_l_rows,1:n_cols)
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+ &
int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
int(lda*size_of_datatype,kind=c_intptr_t), &
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
int(cudaMemcpyHostToDevice,kind=c_int))
if (.not.(successCUDA)) then
print *, "bandred_&
&MATH_DATATYPE&
&: cuda memcpy a_dev failed ", istat
stop 1
endif
endif
endif
#endif
! Transpose vmr -> vmc (stored in umc, second half)
if (useGPU) then
call elpa_transpose_vectors_&
......@@ -1517,7 +1559,7 @@
endif
endif !useGPU
enddo ! istep
enddo ! istep - loop
if (useGPU) then
successCUDA = cuda_free(vav_dev)
......
......@@ -63,7 +63,9 @@
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi64
#define _SHUFFLE 0x55
......@@ -87,7 +89,9 @@
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _AVX512_XOR_EPI _mm512_xor_epi32
#define _SHUFFLE 0xb1
......@@ -361,6 +365,7 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -368,6 +373,13 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -539,6 +551,7 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -546,6 +559,13 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -688,9 +708,10 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
x4 = _AVX512_ADD(x4, _AVX512_FMSUBADD(h1_real, q4, _AVX512_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
}
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -698,6 +719,13 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -824,6 +852,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -831,6 +860,13 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -934,6 +970,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -941,6 +978,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -1022,9 +1066,10 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
x1 = _AVX512_ADD(x1, _AVX512_FMSUBADD(h1_real, q1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
}
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
......@@ -1032,6 +1077,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
#ifdef SINGLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......
......@@ -65,6 +65,9 @@
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MASK_STOREU _mm512_mask_storeu_pd
#define _AVX512_SHUFFLE _mm512_shuffle_pd
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_pd
#endif
#define _SHUFFLE 0x55
#ifdef HAVE_AVX512
......@@ -90,6 +93,9 @@
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MASK_STOREU _mm512_mask_storeu_ps
#define _AVX512_SHUFFLE _mm512_shuffle_ps
#ifdef HAVE_AVX512_XEON
#define _AVX512_XOR _mm512_xor_ps
#endif
#define _SHUFFLE 0xb1
#ifdef HAVE_AVX512
......@@ -380,6 +386,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -388,7 +395,14 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
......@@ -409,6 +423,7 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -426,6 +441,15 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_2hv_single(float com
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h2_real = _AVX512_XOR(h2_real, sign);
h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
......@@ -723,6 +747,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -731,7 +756,14 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
......@@ -748,6 +780,7 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -765,6 +798,15 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_2hv_single(float com
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h2_real = _AVX512_XOR(h2_real, sign);
h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
......@@ -1008,6 +1050,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -1016,7 +1059,13 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -1031,6 +1080,7 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -1048,6 +1098,15 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_2hv_single(float com
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h2_real = _AVX512_XOR(h2_real, sign);
h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
......@@ -1238,6 +1297,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -1246,7 +1306,13 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
#endif
#endif
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -1257,6 +1323,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h2_real = _AVX512_SET1(hh_dbl[ldh*2]);
h2_imag = _AVX512_SET1(hh_dbl[(ldh*2)+1]);
#ifdef HAVE_AVX512_XEON_PHI
#ifdef DOUBLE_PRECISION_COMPLEX
h1_real = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
......@@ -1274,6 +1341,15 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_2hv_single(float comp
h2_real = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
h2_imag = (__AVX512_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
#endif
#endif
#ifdef HAVE_AVX512_XEON
#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h2_real = _AVX512_XOR(h2_real, sign);
h2_imag = _AVX512_XOR(h2_imag, sign);
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp2 = _AVX512_SET(s_dbl[1], s_dbl[0],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment