Allow to discriminate between AVX and AVX2 kernels

parent 69792b15
...@@ -143,6 +143,10 @@ install_real_generic_simple=yes ...@@ -143,6 +143,10 @@ install_real_generic_simple=yes
install_complex_generic=yes install_complex_generic=yes
install_complex_generic_simple=yes install_complex_generic_simple=yes
#want_avx=yes
#want_avx2=yes
#want_sse=yes
AC_LANG([C]) AC_LANG([C])
dnl build with ftimings support dnl build with ftimings support
...@@ -300,8 +304,6 @@ if test "${can_compile_avx}" = "yes" ; then ...@@ -300,8 +304,6 @@ if test "${can_compile_avx}" = "yes" ; then
install_complex_avx_block1=yes install_complex_avx_block1=yes
install_complex_avx_block2=yes install_complex_avx_block2=yes
want_avx=yes
else else
install_real_avx_block2=no install_real_avx_block2=no
install_real_avx_block4=no install_real_avx_block4=no
...@@ -309,10 +311,23 @@ else ...@@ -309,10 +311,23 @@ else
install_complex_avx_block1=no install_complex_avx_block1=no
install_complex_avx_block2=no install_complex_avx_block2=no
want_avx=yes
fi fi
if test "${can_compile_avx2}" = "yes" ; then
install_real_avx2_block2=yes
install_real_avx2_block4=yes
install_real_avx2_block6=yes
install_complex_avx2_block1=yes
install_complex_avx2_block2=yes
else
install_real_avx2_block2=no
install_real_avx2_block4=no
install_real_avx2_block6=no
install_complex_avx2_block1=no
install_complex_avx2_block2=no
fi
AM_CONDITIONAL([HAVE_SSE],[test x"$can_compile_sse" = x"yes"]) AM_CONDITIONAL([HAVE_SSE],[test x"$can_compile_sse" = x"yes"])
if test x"${can_compile_sse}" = x"yes" ; then if test x"${can_compile_sse}" = x"yes" ; then
AC_DEFINE([HAVE_SSE],[1],[SSE is supported on this CPU]) AC_DEFINE([HAVE_SSE],[1],[SSE is supported on this CPU])
...@@ -635,6 +650,7 @@ dnl complex kernels ...@@ -635,6 +650,7 @@ dnl complex kernels
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2])
dnl set the conditionals according to the previous tests dnl set the conditionals according to the previous tests
if test x"${can_use_iso_fortran_env}" = x"yes" ; then if test x"${can_use_iso_fortran_env}" = x"yes" ; then
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env]) AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi fi
...@@ -699,6 +715,21 @@ if test x"${install_real_avx_block6}" = x"yes" ; then ...@@ -699,6 +715,21 @@ if test x"${install_real_avx_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel]) AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel])
fi fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK2_KERNEL],[test x"$install_real_avx2_block2" = x"yes"])
if test x"${install_real_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK2_KERNEL],[1],[can use real_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK4_KERNEL],[test x"$install_real_avx2_block4" = x"yes"])
if test x"${install_real_avx2_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK4_KERNEL],[1],[can use real_avx2_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK6_KERNEL],[test x"$install_real_avx2_block6" = x"yes"])
if test x"${install_real_avx2_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK6_KERNEL],[1],[can use real_avx2_block6 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"]) AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"])
if test x"${install_complex_sse_block1}" = x"yes" ; then if test x"${install_complex_sse_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel]) AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel])
...@@ -719,6 +750,16 @@ if test x"${install_complex_avx_block2}" = x"yes" ; then ...@@ -719,6 +750,16 @@ if test x"${install_complex_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel]) AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel])
fi fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[test x"$install_complex_avx2_block1" = x"yes"])
if test x"${install_complex_avx2_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[1],[can use complex_avx2_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[test x"$install_complex_avx2_block2" = x"yes"])
if test x"${install_complex_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[1],[can use complex_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"]) AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"])
if test x"${install_real_bgp}" = x"yes" ; then if test x"${install_real_bgp}" = x"yes" ; then
AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel]) AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel])
...@@ -816,13 +857,13 @@ mkdir -p test/shared_sources ...@@ -816,13 +857,13 @@ mkdir -p test/shared_sources
grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1 grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1
if test "${can_compile_avx}" = "no" ; then if test "${can_compile_avx}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then # if test x"${want_avx}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX instructions]) AC_MSG_WARN([Could not compile AVX instructions])
fi # fi
fi fi
if test "${can_compile_avx2}" = "no" ; then if test "${can_compile_avx2}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then # if test x"${want_avx2}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX2 instructions]) AC_MSG_WARN([Could not compile AVX2 instructions])
fi # fi
fi fi
...@@ -9,8 +9,11 @@ ...@@ -9,8 +9,11 @@
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 9 #define ELPA2_REAL_KERNEL_AVX_BLOCK2 9
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 10 #define ELPA2_REAL_KERNEL_AVX_BLOCK4 10
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 11 #define ELPA2_REAL_KERNEL_AVX_BLOCK6 11
#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12
#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13
#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14
#define ELPA2_NUMBER_OF_REAL_KERNELS 11 #define ELPA2_NUMBER_OF_REAL_KERNELS 14
#define ELPA2_COMPLEX_KERNEL_GENERIC 1 #define ELPA2_COMPLEX_KERNEL_GENERIC 1
...@@ -22,5 +25,8 @@ ...@@ -22,5 +25,8 @@
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 7 #define ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 8 #define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 8
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9 #define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 9
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 11
...@@ -72,15 +72,18 @@ module ELPA2_utilities ...@@ -72,15 +72,18 @@ module ELPA2_utilities
public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, & public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, &
REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ, & REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ, &
REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_SSE_BLOCK2, & REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_SSE_BLOCK2, &
REAL_ELPA_KERNEL_AVX_BLOCK2, &
REAL_ELPA_KERNEL_SSE_BLOCK4, REAL_ELPA_KERNEL_SSE_BLOCK6, & REAL_ELPA_KERNEL_SSE_BLOCK4, REAL_ELPA_KERNEL_SSE_BLOCK6, &
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6 REAL_ELPA_KERNEL_AVX_BLOCK2, &
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, &
REAL_ELPA_KERNEL_AVX2_BLOCK2, &
REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6
public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, & public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, & COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, &
COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1, & COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1, &
COMPLEX_ELPA_KERNEL_SSE_BLOCK2, & COMPLEX_ELPA_KERNEL_SSE_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2 COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2
public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES
...@@ -106,6 +109,9 @@ module ELPA2_utilities ...@@ -106,6 +109,9 @@ module ELPA2_utilities
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_REAL_KERNEL_AVX_BLOCK2 integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_REAL_KERNEL_AVX_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4 = ELPA2_REAL_KERNEL_AVX_BLOCK4 integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4 = ELPA2_REAL_KERNEL_AVX_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6 = ELPA2_REAL_KERNEL_AVX_BLOCK6 integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6 = ELPA2_REAL_KERNEL_AVX_BLOCK6
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_REAL_KERNEL_AVX2_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK4 = ELPA2_REAL_KERNEL_AVX2_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK6 = ELPA2_REAL_KERNEL_AVX2_BLOCK6
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) #if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
...@@ -123,7 +129,10 @@ module ELPA2_utilities ...@@ -123,7 +129,10 @@ module ELPA2_utilities
"REAL_ELPA_KERNEL_SSE_BLOCK6 ", & "REAL_ELPA_KERNEL_SSE_BLOCK6 ", &
"REAL_ELPA_KERNEL_AVX_BLOCK2 ", & "REAL_ELPA_KERNEL_AVX_BLOCK2 ", &
"REAL_ELPA_KERNEL_AVX_BLOCK4 ", & "REAL_ELPA_KERNEL_AVX_BLOCK4 ", &
"REAL_ELPA_KERNEL_AVX_BLOCK6 "/) "REAL_ELPA_KERNEL_AVX_BLOCK6 ", &
"REAL_ELPA_KERNEL_AVX2_BLOCK2 ", &
"REAL_ELPA_KERNEL_AVX2_BLOCK4 ", &
"REAL_ELPA_KERNEL_AVX2_BLOCK6 "/)
integer, parameter :: number_of_complex_kernels = ELPA2_NUMBER_OF_COMPLEX_KERNELS integer, parameter :: number_of_complex_kernels = ELPA2_NUMBER_OF_COMPLEX_KERNELS
integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC = ELPA2_COMPLEX_KERNEL_GENERIC integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC = ELPA2_COMPLEX_KERNEL_GENERIC
...@@ -135,6 +144,8 @@ module ELPA2_utilities ...@@ -135,6 +144,8 @@ module ELPA2_utilities
integer, parameter :: COMPLEX_ELPA_KERNEL_SSE_BLOCK2 = ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 integer, parameter :: COMPLEX_ELPA_KERNEL_SSE_BLOCK2 = ELPA2_COMPLEX_KERNEL_SSE_BLOCK2
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK1
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK2
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) #if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
...@@ -150,7 +161,9 @@ module ELPA2_utilities ...@@ -150,7 +161,9 @@ module ELPA2_utilities
"COMPLEX_ELPA_KERNEL_SSE_BLOCK1 ", & "COMPLEX_ELPA_KERNEL_SSE_BLOCK1 ", &
"COMPLEX_ELPA_KERNEL_SSE_BLOCK2 ", & "COMPLEX_ELPA_KERNEL_SSE_BLOCK2 ", &
"COMPLEX_ELPA_KERNEL_AVX_BLOCK1 ", & "COMPLEX_ELPA_KERNEL_AVX_BLOCK1 ", &
"COMPLEX_ELPA_KERNEL_AVX_BLOCK2 "/) "COMPLEX_ELPA_KERNEL_AVX_BLOCK2 ", &
"COMPLEX_ELPA_KERNEL_AVX2_BLOCK1 ", &
"COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 "/)
integer, parameter :: & integer, parameter :: &
AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) = & AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) = &
...@@ -211,6 +224,23 @@ module ELPA2_utilities ...@@ -211,6 +224,23 @@ module ELPA2_utilities
#else #else
,0 & ,0 &
#endif #endif
#if WITH_REAL_AVX2_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX2_BLOCK4_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_REAL_AVX2_BLOCK6_KERNEL
,1 &
#else
,0 &
#endif
/) /)
integer, parameter :: & integer, parameter :: &
...@@ -262,6 +292,17 @@ module ELPA2_utilities ...@@ -262,6 +292,17 @@ module ELPA2_utilities
#else #else
,0 & ,0 &
#endif #endif
#if WITH_COMPLEX_AVX2_BLOCK1_KERNEL
,1 &
#else
,0 &
#endif
#if WITH_COMPLEX_AVX2_BLOCK2_KERNEL
,1 &
#else
,0 &
#endif
/) /)
!****** !******
......
...@@ -118,9 +118,10 @@ module compute_hh_trafo_complex ...@@ -118,9 +118,10 @@ module compute_hh_trafo_complex
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */ #endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) #if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) then if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK2) ) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
ttt = mpi_wtime() ttt = mpi_wtime()
do j = ncols, 2, -2 do j = ncols, 2, -2
...@@ -260,9 +261,10 @@ module compute_hh_trafo_complex ...@@ -260,9 +261,10 @@ module compute_hh_trafo_complex
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNE */ #endif /* WITH_COMPLEX_SSE_BLOCK1_KERNE */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) #if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) then if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
ttt = mpi_wtime() ttt = mpi_wtime()
do j = ncols, 1, -1 do j = ncols, 1, -1
...@@ -277,7 +279,7 @@ module compute_hh_trafo_complex ...@@ -277,7 +279,7 @@ module compute_hh_trafo_complex
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) #if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */ #endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNE */ #endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
if (my_thread==1) then if (my_thread==1) then
......
...@@ -104,6 +104,7 @@ module compute_hh_trafo_real ...@@ -104,6 +104,7 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2 .or. & if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2 .or. & THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC .or. & THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE .or. & THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE .or. &
...@@ -229,9 +230,10 @@ module compute_hh_trafo_real ...@@ -229,9 +230,10 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK2_KERNEL */ #endif /* WITH_REAL_SSE_BLOCK2_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) then if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
do j = ncols, 2, -2 do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
...@@ -353,9 +355,10 @@ module compute_hh_trafo_real ...@@ -353,9 +355,10 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK4_KERNEL */ #endif /* WITH_REAL_SSE_BLOCK4_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK4_KERNEL) #if defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX2_BLOCK4_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) then if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK4)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4 do j = ncols, 4, -4
...@@ -450,9 +453,10 @@ module compute_hh_trafo_real ...@@ -450,9 +453,10 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK4_KERNEL */ #endif /* WITH_REAL_SSE_BLOCK4_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK6_KERNEL) #if defined(WITH_REAL_AVX_BLOCK6_KERNEL) || defined(WITH_REAL_AVX2_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK6) then if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK6) .or. &
(THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK6)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */ #endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6 do j = ncols, 6, -6
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment