Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
ebc097eb
Unverified
Commit
ebc097eb
authored
Apr 06, 2016
by
Andreas Marek
Browse files
Allow to discriminate between AVX and AVX2 kernels
parent
69792b15
Changes
5
Hide whitespace changes
Inline
Side-by-side
configure.ac
View file @
ebc097eb
...
...
@@ -143,6 +143,10 @@ install_real_generic_simple=yes
install_complex_generic=yes
install_complex_generic_simple=yes
#want_avx=yes
#want_avx2=yes
#want_sse=yes
AC_LANG([C])
dnl build with ftimings support
...
...
@@ -300,8 +304,6 @@ if test "${can_compile_avx}" = "yes" ; then
install_complex_avx_block1=yes
install_complex_avx_block2=yes
want_avx=yes
else
install_real_avx_block2=no
install_real_avx_block4=no
...
...
@@ -309,10 +311,23 @@ else
install_complex_avx_block1=no
install_complex_avx_block2=no
want_avx=yes
fi
if test "${can_compile_avx2}" = "yes" ; then
install_real_avx2_block2=yes
install_real_avx2_block4=yes
install_real_avx2_block6=yes
install_complex_avx2_block1=yes
install_complex_avx2_block2=yes
else
install_real_avx2_block2=no
install_real_avx2_block4=no
install_real_avx2_block6=no
install_complex_avx2_block1=no
install_complex_avx2_block2=no
fi
AM_CONDITIONAL([HAVE_SSE],[test x"$can_compile_sse" = x"yes"])
if test x"${can_compile_sse}" = x"yes" ; then
AC_DEFINE([HAVE_SSE],[1],[SSE is supported on this CPU])
...
...
@@ -635,6 +650,7 @@ dnl complex kernels
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2])
dnl set the conditionals according to the previous tests
if test x"${can_use_iso_fortran_env}" = x"yes" ; then
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi
...
...
@@ -699,6 +715,21 @@ if test x"${install_real_avx_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK2_KERNEL],[test x"$install_real_avx2_block2" = x"yes"])
if test x"${install_real_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK2_KERNEL],[1],[can use real_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK4_KERNEL],[test x"$install_real_avx2_block4" = x"yes"])
if test x"${install_real_avx2_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK4_KERNEL],[1],[can use real_avx2_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK6_KERNEL],[test x"$install_real_avx2_block6" = x"yes"])
if test x"${install_real_avx2_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK6_KERNEL],[1],[can use real_avx2_block6 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"])
if test x"${install_complex_sse_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel])
...
...
@@ -719,6 +750,16 @@ if test x"${install_complex_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[test x"$install_complex_avx2_block1" = x"yes"])
if test x"${install_complex_avx2_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[1],[can use complex_avx2_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[test x"$install_complex_avx2_block2" = x"yes"])
if test x"${install_complex_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[1],[can use complex_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"])
if test x"${install_real_bgp}" = x"yes" ; then
AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel])
...
...
@@ -816,13 +857,13 @@ mkdir -p test/shared_sources
grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1
if test "${can_compile_avx}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then
#
if test x"${want_avx}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX instructions])
fi
#
fi
fi
if test "${can_compile_avx2}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then
#
if test x"${want_avx2}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX2 instructions])
fi
#
fi
fi
elpa/elpa_kernel_constants.h
View file @
ebc097eb
...
...
@@ -9,8 +9,11 @@
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 9
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 10
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 11
#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12
#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13
#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14
#define ELPA2_NUMBER_OF_REAL_KERNELS 11
#define ELPA2_NUMBER_OF_REAL_KERNELS 14
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
...
...
@@ -22,5 +25,8 @@
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 8
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 9
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 11
src/elpa2_utilities.F90
View file @
ebc097eb
...
...
@@ -72,15 +72,18 @@ module ELPA2_utilities
public
::
REAL_ELPA_KERNEL_GENERIC
,
REAL_ELPA_KERNEL_GENERIC_SIMPLE
,
&
REAL_ELPA_KERNEL_BGP
,
REAL_ELPA_KERNEL_BGQ
,
&
REAL_ELPA_KERNEL_SSE
,
REAL_ELPA_KERNEL_SSE_BLOCK2
,
&
REAL_ELPA_KERNEL_AVX_BLOCK2
,
&
REAL_ELPA_KERNEL_SSE_BLOCK4
,
REAL_ELPA_KERNEL_SSE_BLOCK6
,
&
REAL_ELPA_KERNEL_AVX_BLOCK4
,
REAL_ELPA_KERNEL_AVX_BLOCK6
REAL_ELPA_KERNEL_AVX_BLOCK2
,
&
REAL_ELPA_KERNEL_AVX_BLOCK4
,
REAL_ELPA_KERNEL_AVX_BLOCK6
,
&
REAL_ELPA_KERNEL_AVX2_BLOCK2
,
&
REAL_ELPA_KERNEL_AVX2_BLOCK4
,
REAL_ELPA_KERNEL_AVX2_BLOCK6
public
::
COMPLEX_ELPA_KERNEL_GENERIC
,
COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
,
&
COMPLEX_ELPA_KERNEL_BGP
,
COMPLEX_ELPA_KERNEL_BGQ
,
&
COMPLEX_ELPA_KERNEL_SSE
,
COMPLEX_ELPA_KERNEL_SSE_BLOCK1
,
&
COMPLEX_ELPA_KERNEL_SSE_BLOCK2
,
&
COMPLEX_ELPA_KERNEL_AVX_BLOCK1
,
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
COMPLEX_ELPA_KERNEL_AVX_BLOCK1
,
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
,
&
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1
,
COMPLEX_ELPA_KERNEL_AVX2_BLOCK2
public
::
REAL_ELPA_KERNEL_NAMES
,
COMPLEX_ELPA_KERNEL_NAMES
...
...
@@ -106,6 +109,9 @@ module ELPA2_utilities
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX_BLOCK2
=
ELPA2_REAL_KERNEL_AVX_BLOCK2
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX_BLOCK4
=
ELPA2_REAL_KERNEL_AVX_BLOCK4
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX_BLOCK6
=
ELPA2_REAL_KERNEL_AVX_BLOCK6
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX2_BLOCK2
=
ELPA2_REAL_KERNEL_AVX2_BLOCK2
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX2_BLOCK4
=
ELPA2_REAL_KERNEL_AVX2_BLOCK4
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX2_BLOCK6
=
ELPA2_REAL_KERNEL_AVX2_BLOCK6
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
integer
,
parameter
::
DEFAULT_REAL_ELPA_KERNEL
=
REAL_ELPA_KERNEL_GENERIC
...
...
@@ -123,7 +129,10 @@ module ELPA2_utilities
"REAL_ELPA_KERNEL_SSE_BLOCK6 "
,
&
"REAL_ELPA_KERNEL_AVX_BLOCK2 "
,
&
"REAL_ELPA_KERNEL_AVX_BLOCK4 "
,
&
"REAL_ELPA_KERNEL_AVX_BLOCK6 "
/)
"REAL_ELPA_KERNEL_AVX_BLOCK6 "
,
&
"REAL_ELPA_KERNEL_AVX2_BLOCK2 "
,
&
"REAL_ELPA_KERNEL_AVX2_BLOCK4 "
,
&
"REAL_ELPA_KERNEL_AVX2_BLOCK6 "
/)
integer
,
parameter
::
number_of_complex_kernels
=
ELPA2_NUMBER_OF_COMPLEX_KERNELS
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_GENERIC
=
ELPA2_COMPLEX_KERNEL_GENERIC
...
...
@@ -135,6 +144,8 @@ module ELPA2_utilities
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_SSE_BLOCK2
=
ELPA2_COMPLEX_KERNEL_SSE_BLOCK2
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_AVX_BLOCK1
=
ELPA2_COMPLEX_KERNEL_AVX_BLOCK1
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
=
ELPA2_COMPLEX_KERNEL_AVX_BLOCK2
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1
=
ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_AVX2_BLOCK2
=
ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
integer
,
parameter
::
DEFAULT_COMPLEX_ELPA_KERNEL
=
COMPLEX_ELPA_KERNEL_GENERIC
...
...
@@ -150,7 +161,9 @@ module ELPA2_utilities
"COMPLEX_ELPA_KERNEL_SSE_BLOCK1 "
,
&
"COMPLEX_ELPA_KERNEL_SSE_BLOCK2 "
,
&
"COMPLEX_ELPA_KERNEL_AVX_BLOCK1 "
,
&
"COMPLEX_ELPA_KERNEL_AVX_BLOCK2 "
/)
"COMPLEX_ELPA_KERNEL_AVX_BLOCK2 "
,
&
"COMPLEX_ELPA_KERNEL_AVX2_BLOCK1 "
,
&
"COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 "
/)
integer
,
parameter
::
&
AVAILABLE_REAL_ELPA_KERNELS
(
number_of_real_kernels
)
=
&
...
...
@@ -211,6 +224,23 @@ module ELPA2_utilities
#else
,
0
&
#endif
#if WITH_REAL_AVX2_BLOCK2_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_REAL_AVX2_BLOCK4_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_REAL_AVX2_BLOCK6_KERNEL
,
1
&
#else
,
0
&
#endif
/)
integer
,
parameter
::
&
...
...
@@ -262,6 +292,17 @@ module ELPA2_utilities
#else
,
0
&
#endif
#if WITH_COMPLEX_AVX2_BLOCK1_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_COMPLEX_AVX2_BLOCK2_KERNEL
,
1
&
#else
,
0
&
#endif
/)
!******
...
...
src/mod_compute_hh_trafo_complex.F90
View file @
ebc097eb
...
...
@@ -118,9 +118,10 @@ module compute_hh_trafo_complex
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if
(
THIS_COMPLEX_ELPA_KERNEL
.eq.
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
)
then
if
(
(
THIS_COMPLEX_ELPA_KERNEL
.eq.
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
)
.or.
&
(
THIS_COMPLEX_ELPA_KERNEL
.eq.
COMPLEX_ELPA_KERNEL_AVX2_BLOCK2
)
)
then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
ttt
=
mpi_wtime
()
do
j
=
ncols
,
2
,
-2
...
...
@@ -260,9 +261,10 @@ module compute_hh_trafo_complex
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNE */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if
(
THIS_COMPLEX_ELPA_KERNEL
.eq.
COMPLEX_ELPA_KERNEL_AVX_BLOCK1
)
then
if
((
THIS_COMPLEX_ELPA_KERNEL
.eq.
COMPLEX_ELPA_KERNEL_AVX_BLOCK1
)
.or.
&
(
THIS_COMPLEX_ELPA_KERNEL
.eq.
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1
))
then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
ttt
=
mpi_wtime
()
do
j
=
ncols
,
1
,
-1
...
...
@@ -277,7 +279,7 @@ module compute_hh_trafo_complex
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNE */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
#ifdef WITH_OPENMP
if
(
my_thread
==
1
)
then
...
...
src/mod_compute_hh_trafo_real.F90
View file @
ebc097eb
...
...
@@ -104,6 +104,7 @@ module compute_hh_trafo_real
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if
(
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_AVX_BLOCK2
.or.
&
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_AVX2_BLOCK2
.or.
&
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_SSE_BLOCK2
.or.
&
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_GENERIC
.or.
&
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_GENERIC_SIMPLE
.or.
&
...
...
@@ -229,9 +230,10 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK2_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if
(
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_AVX_BLOCK2
)
then
if
((
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_AVX_BLOCK2
)
.or.
&
(
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_AVX2_BLOCK2
))
then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
do
j
=
ncols
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
...
...
@@ -353,9 +355,10 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK4_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK4_KERNEL)
#if defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX2_BLOCK4_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if
(
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_AVX_BLOCK4
)
then
if
((
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_AVX_BLOCK4
)
.or.
&
(
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_AVX2_BLOCK4
))
then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do
j
=
ncols
,
4
,
-4
...
...
@@ -450,9 +453,10 @@ module compute_hh_trafo_real
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK4_KERNEL */
#if defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#if defined(WITH_REAL_AVX_BLOCK6_KERNEL) || defined(WITH_REAL_AVX2_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if
(
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_AVX_BLOCK6
)
then
if
((
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_AVX_BLOCK6
)
.or.
&
(
THIS_REAL_ELPA_KERNEL
.eq.
REAL_ELPA_KERNEL_AVX2_BLOCK6
))
then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do
j
=
ncols
,
6
,
-6
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment