Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
6e86364f
Unverified
Commit
6e86364f
authored
Apr 08, 2016
by
Lorenz Hüdepohl
Browse files
Merge branch 'master' of git@gitlab.mpcdf.mpg.de:elpa/elpa.git
parents
59e405e0
ebc097eb
Changes
18
Expand all
Hide whitespace changes
Inline
Side-by-side
Makefile.am
View file @
6e86364f
...
@@ -80,31 +80,46 @@ if WITH_COMPLEX_SSE_KERNEL
...
@@ -80,31 +80,46 @@ if WITH_COMPLEX_SSE_KERNEL
endif
endif
endif
endif
if
WITH_REAL_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c
endif
if
WITH_REAL_AVX_BLOCK2_KERNEL
if
WITH_REAL_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c
endif
endif
if
WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
endif
if
WITH_REAL_AVX_BLOCK4_KERNEL
if
WITH_REAL_AVX_BLOCK4_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c
endif
endif
if
WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
endif
if
WITH_REAL_AVX_BLOCK6_KERNEL
if
WITH_REAL_AVX_BLOCK6_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
endif
endif
if
WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp
endif
if
WITH_COMPLEX_AVX_BLOCK1_KERNEL
if
WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
endif
endif
if
WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp
endif
if
WITH_COMPLEX_AVX_BLOCK2_KERNEL
if
WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp
endif
endif
#if WITH_AVX_SANDYBRIDGE
# libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c \
# src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
#endif
# install any .mod files in the include/ dir
# install any .mod files in the include/ dir
elpa_includedir
=
$(includedir)
/elpa@SUFFIX@-@PACKAGE_VERSION@
elpa_includedir
=
$(includedir)
/elpa@SUFFIX@-@PACKAGE_VERSION@
nobase_elpa_include_HEADERS
=
$(
wildcard
modules/
*
)
nobase_elpa_include_HEADERS
=
$(
wildcard
modules/
*
)
...
...
configure.ac
View file @
6e86364f
...
@@ -143,6 +143,10 @@ install_real_generic_simple=yes
...
@@ -143,6 +143,10 @@ install_real_generic_simple=yes
install_complex_generic=yes
install_complex_generic=yes
install_complex_generic_simple=yes
install_complex_generic_simple=yes
#want_avx=yes
#want_avx2=yes
#want_sse=yes
AC_LANG([C])
AC_LANG([C])
dnl build with ftimings support
dnl build with ftimings support
...
@@ -196,12 +200,26 @@ $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/nul
...
@@ -196,12 +200,26 @@ $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/nul
if test "$?" == 0; then
if test "$?" == 0; then
can_compile_sse=yes
can_compile_sse=yes
install_real_sse=yes
install_real_sse=yes
install_real_sse_block2=yes
install_real_sse_block4=yes
install_real_sse_block6=yes
install_complex_sse=yes
install_complex_sse=yes
install_complex_sse_block1=yes
install_complex_sse_block2=yes
else
else
can_compile_sse=no
can_compile_sse=no
install_real_sse=no
install_real_sse=no
install_real_sse_block2=no
install_real_sse_block4=no
install_real_sse_block6=no
install_complex_sse=no
install_complex_sse=no
install_complex_sse_block1=no
install_complex_sse_block2=no
fi
fi
rm -f ./test.o
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}])
AC_MSG_RESULT([${can_compile_sse}])
...
@@ -286,8 +304,6 @@ if test "${can_compile_avx}" = "yes" ; then
...
@@ -286,8 +304,6 @@ if test "${can_compile_avx}" = "yes" ; then
install_complex_avx_block1=yes
install_complex_avx_block1=yes
install_complex_avx_block2=yes
install_complex_avx_block2=yes
want_avx=yes
else
else
install_real_avx_block2=no
install_real_avx_block2=no
install_real_avx_block4=no
install_real_avx_block4=no
...
@@ -295,10 +311,23 @@ else
...
@@ -295,10 +311,23 @@ else
install_complex_avx_block1=no
install_complex_avx_block1=no
install_complex_avx_block2=no
install_complex_avx_block2=no
want_avx=yes
fi
fi
if test "${can_compile_avx2}" = "yes" ; then
install_real_avx2_block2=yes
install_real_avx2_block4=yes
install_real_avx2_block6=yes
install_complex_avx2_block1=yes
install_complex_avx2_block2=yes
else
install_real_avx2_block2=no
install_real_avx2_block4=no
install_real_avx2_block6=no
install_complex_avx2_block1=no
install_complex_avx2_block2=no
fi
AM_CONDITIONAL([HAVE_SSE],[test x"$can_compile_sse" = x"yes"])
AM_CONDITIONAL([HAVE_SSE],[test x"$can_compile_sse" = x"yes"])
if test x"${can_compile_sse}" = x"yes" ; then
if test x"${can_compile_sse}" = x"yes" ; then
AC_DEFINE([HAVE_SSE],[1],[SSE is supported on this CPU])
AC_DEFINE([HAVE_SSE],[1],[SSE is supported on this CPU])
...
@@ -477,7 +506,6 @@ AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environm
...
@@ -477,7 +506,6 @@ AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environm
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
program test_get_environment
program test_get_environment
character(len=256) :: homedir
character(len=256) :: homedir
call get_environment_variable("HOME",homedir)
call get_environment_variable("HOME",homedir)
end program
end program
...
@@ -570,6 +598,15 @@ dnl real kernels
...
@@ -570,6 +598,15 @@ dnl real kernels
dnl bgq kernel
dnl bgq kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq])
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq])
dnl real-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block2-kernel-only],[real-sse-block2-kernel],[install_real_sse_block2])
dnl real-sse-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block4-kernel]-only,[real-sse-block4-kernel],[install_real_sse_block4])
dnl real-sse-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block6-kernel-only],[real-sse-block6-kernel],[install_real_sse_block6])
dnl real-avx-block2 kernel
dnl real-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2])
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2])
...
@@ -600,6 +637,12 @@ dnl complex kernels
...
@@ -600,6 +637,12 @@ dnl complex kernels
dnl complex-bgq kernel
dnl complex-bgq kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq])
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq])
dnl complex-sse-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block1-kernel-only],[complex-sse-block1-kernel],[install_complex_sse_block1])
dnl complex-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block2-kernel-only],[complex-sse-block2-kernel],[install_complex_sse_block2])
dnl complex-avx-block1 kernel
dnl complex-avx-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1])
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1])
...
@@ -607,6 +650,7 @@ dnl complex kernels
...
@@ -607,6 +650,7 @@ dnl complex kernels
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2])
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2])
dnl set the conditionals according to the previous tests
dnl set the conditionals according to the previous tests
if test x"${can_use_iso_fortran_env}" = x"yes" ; then
if test x"${can_use_iso_fortran_env}" = x"yes" ; then
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi
fi
...
@@ -641,6 +685,21 @@ if test x"${install_complex_sse}" = x"yes" ; then
...
@@ -641,6 +685,21 @@ if test x"${install_complex_sse}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel])
AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel])
fi
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK2_KERNEL],[test x"$install_real_sse_block2" = x"yes"])
if test x"${install_real_sse_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK2_KERNEL],[1],[can use real_sse_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK4_KERNEL],[test x"$install_real_sse_block4" = x"yes"])
if test x"${install_real_sse_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK4_KERNEL],[1],[can use real_sse_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK6_KERNEL],[test x"$install_real_sse_block6" = x"yes"])
if test x"${install_real_sse_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK6_KERNEL],[1],[can use real_sse_block6 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"])
AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"])
if test x"${install_real_avx_block2}" = x"yes" ; then
if test x"${install_real_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel])
AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel])
...
@@ -656,6 +715,31 @@ if test x"${install_real_avx_block6}" = x"yes" ; then
...
@@ -656,6 +715,31 @@ if test x"${install_real_avx_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel])
AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel])
fi
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK2_KERNEL],[test x"$install_real_avx2_block2" = x"yes"])
if test x"${install_real_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK2_KERNEL],[1],[can use real_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK4_KERNEL],[test x"$install_real_avx2_block4" = x"yes"])
if test x"${install_real_avx2_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK4_KERNEL],[1],[can use real_avx2_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK6_KERNEL],[test x"$install_real_avx2_block6" = x"yes"])
if test x"${install_real_avx2_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK6_KERNEL],[1],[can use real_avx2_block6 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"])
if test x"${install_complex_sse_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[test x"$install_complex_sse_block2" = x"yes"])
if test x"${install_complex_sse_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[1],[can use complex_sse_block2 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"])
AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"])
if test x"${install_complex_avx_block1}" = x"yes" ; then
if test x"${install_complex_avx_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel])
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel])
...
@@ -666,6 +750,16 @@ if test x"${install_complex_avx_block2}" = x"yes" ; then
...
@@ -666,6 +750,16 @@ if test x"${install_complex_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel])
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel])
fi
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[test x"$install_complex_avx2_block1" = x"yes"])
if test x"${install_complex_avx2_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[1],[can use complex_avx2_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[test x"$install_complex_avx2_block2" = x"yes"])
if test x"${install_complex_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[1],[can use complex_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"])
AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"])
if test x"${install_real_bgp}" = x"yes" ; then
if test x"${install_real_bgp}" = x"yes" ; then
AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel])
AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel])
...
@@ -763,13 +857,13 @@ mkdir -p test/shared_sources
...
@@ -763,13 +857,13 @@ mkdir -p test/shared_sources
grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1
grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1
if test "${can_compile_avx}" = "no" ; then
if test "${can_compile_avx}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then
#
if test x"${want_avx}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX instructions])
AC_MSG_WARN([Could not compile AVX instructions])
fi
#
fi
fi
fi
if test "${can_compile_avx2}" = "no" ; then
if test "${can_compile_avx2}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then
#
if test x"${want_avx
2
}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX2 instructions])
AC_MSG_WARN([Could not compile AVX2 instructions])
fi
#
fi
fi
fi
elpa/elpa_kernel_constants.h
View file @
6e86364f
...
@@ -3,11 +3,17 @@
...
@@ -3,11 +3,17 @@
#define ELPA2_REAL_KERNEL_BGP 3
#define ELPA2_REAL_KERNEL_BGP 3
#define ELPA2_REAL_KERNEL_BGQ 4
#define ELPA2_REAL_KERNEL_BGQ 4
#define ELPA2_REAL_KERNEL_SSE 5
#define ELPA2_REAL_KERNEL_SSE 5
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6
#define ELPA2_REAL_KERNEL_SSE_BLOCK2 6
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7
#define ELPA2_REAL_KERNEL_SSE_BLOCK4 7
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8
#define ELPA2_REAL_KERNEL_SSE_BLOCK6 8
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 9
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 10
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 11
#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12
#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13
#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14
#define ELPA2_NUMBER_OF_REAL_KERNELS
8
#define ELPA2_NUMBER_OF_REAL_KERNELS
14
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
...
@@ -15,7 +21,12 @@
...
@@ -15,7 +21,12 @@
#define ELPA2_COMPLEX_KERNEL_BGP 3
#define ELPA2_COMPLEX_KERNEL_BGP 3
#define ELPA2_COMPLEX_KERNEL_BGQ 4
#define ELPA2_COMPLEX_KERNEL_BGQ 4
#define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 8
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 7
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 11
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
View file @
6e86364f
...
@@ -59,12 +59,15 @@
...
@@ -59,12 +59,15 @@
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
// --------------------------------------------------------------------------------------------------
#include
"config-f90.h"
#include
<complex>
#include
<complex>
#include
<x86intrin.h>
#include
<x86intrin.h>
#define __forceinline __attribute__((always_inline))
#define __forceinline __attribute__((always_inline))
#ifdef HAVE_AVX2
#ifdef __FMA4__
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
...
@@ -77,6 +80,8 @@
...
@@ -77,6 +80,8 @@
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif
#endif
#endif
extern
"C"
{
extern
"C"
{
//Forward declaration
//Forward declaration
...
...
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp
View file @
6e86364f
...
@@ -59,12 +59,15 @@
...
@@ -59,12 +59,15 @@
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
// --------------------------------------------------------------------------------------------------
#include
"config-f90.h"
#include
<complex>
#include
<complex>
#include
<x86intrin.h>
#include
<x86intrin.h>
#define __forceinline __attribute__((always_inline))
#define __forceinline __attribute__((always_inline))
#ifdef HAVE_AVX2
#ifdef __FMA4__
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
...
@@ -77,6 +80,8 @@
...
@@ -77,6 +80,8 @@
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif
#endif
#endif
extern
"C"
{
extern
"C"
{
//Forward declaration
//Forward declaration
...
...
src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp
0 → 100644
View file @
6e86364f
This diff is collapsed.
Click to expand it.
src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp
0 → 100644
View file @
6e86364f
This diff is collapsed.
Click to expand it.
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c
View file @
6e86364f
...
@@ -60,10 +60,14 @@
...
@@ -60,10 +60,14 @@
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
// --------------------------------------------------------------------------------------------------
#include
"config-f90.h"
#include
<x86intrin.h>
#include
<x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_AVX2
#ifdef __FMA4__
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
...
@@ -74,6 +78,8 @@
...
@@ -74,6 +78,8 @@
#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
#endif
#endif
#endif
//Forward declaration
//Forward declaration
__forceinline
void
hh_trafo_kernel_4_AVX_2hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
);
__forceinline
void
hh_trafo_kernel_4_AVX_2hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
);
__forceinline
void
hh_trafo_kernel_8_AVX_2hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
);
__forceinline
void
hh_trafo_kernel_8_AVX_2hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
);
...
...
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c
View file @
6e86364f
...
@@ -59,11 +59,14 @@
...
@@ -59,11 +59,14 @@
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
// --------------------------------------------------------------------------------------------------
#include
"config-f90.h"
#include
<x86intrin.h>
#include
<x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_AVX2
#ifdef __FMA4__
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
...
@@ -78,6 +81,8 @@
...
@@ -78,6 +81,8 @@
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#endif
#endif
#endif
//Forward declaration
//Forward declaration
__forceinline
void
hh_trafo_kernel_4_AVX_4hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s_1_2
,
double
s_1_3
,
double
s_2_3
,
double
s_1_4
,
double
s_2_4
,
double
s_3_4
);
__forceinline
void
hh_trafo_kernel_4_AVX_4hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s_1_2
,
double
s_1_3
,
double
s_2_3
,
double
s_1_4
,
double
s_2_4
,
double
s_3_4
);
__forceinline
void
hh_trafo_kernel_8_AVX_4hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s_1_2
,
double
s_1_3
,
double
s_2_3
,
double
s_1_4
,
double
s_2_4
,
double
s_3_4
);
__forceinline
void
hh_trafo_kernel_8_AVX_4hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s_1_2
,
double
s_1_3
,
double
s_2_3
,
double
s_1_4
,
double
s_2_4
,
double
s_3_4
);
...
...
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
View file @
6e86364f
...
@@ -60,10 +60,14 @@
...
@@ -60,10 +60,14 @@
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
// --------------------------------------------------------------------------------------------------
#include
"config-f90.h"
#include
<x86intrin.h>
#include
<x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_AVX2
#ifdef __FMA4__
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
...
@@ -78,6 +82,8 @@
...
@@ -78,6 +82,8 @@
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#endif
#endif
#endif
//Forward declaration
//Forward declaration
static
void
hh_trafo_kernel_4_AVX_6hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
);
static
void
hh_trafo_kernel_4_AVX_6hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
);
static
void
hh_trafo_kernel_8_AVX_6hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
);
static
void
hh_trafo_kernel_8_AVX_6hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
);
...
...
src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c
0 → 100644
View file @
6e86364f
This diff is collapsed.
Click to expand it.
src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
0 → 100644
View file @
6e86364f
This diff is collapsed.
Click to expand it.
src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
0 → 100644
View file @
6e86364f
This diff is collapsed.
Click to expand it.
src/elpa2_utilities.F90
View file @
6e86364f
...
@@ -71,13 +71,19 @@ module ELPA2_utilities
...
@@ -71,13 +71,19 @@ module ELPA2_utilities
public
::
get_actual_real_kernel_name
,
get_actual_complex_kernel_name
public
::
get_actual_real_kernel_name
,
get_actual_complex_kernel_name
public
::
REAL_ELPA_KERNEL_GENERIC
,
REAL_ELPA_KERNEL_GENERIC_SIMPLE
,
&
public
::
REAL_ELPA_KERNEL_GENERIC
,
REAL_ELPA_KERNEL_GENERIC_SIMPLE
,
&
REAL_ELPA_KERNEL_BGP
,
REAL_ELPA_KERNEL_BGQ
,
&
REAL_ELPA_KERNEL_BGP
,
REAL_ELPA_KERNEL_BGQ
,
&
REAL_ELPA_KERNEL_SSE
,
REAL_ELPA_KERNEL_AVX_BLOCK2
,
&
REAL_ELPA_KERNEL_SSE
,
REAL_ELPA_KERNEL_SSE_BLOCK2
,
&
REAL_ELPA_KERNEL_AVX_BLOCK4
,
REAL_ELPA_KERNEL_AVX_BLOCK6
REAL_ELPA_KERNEL_SSE_BLOCK4
,
REAL_ELPA_KERNEL_SSE_BLOCK6
,
&
REAL_ELPA_KERNEL_AVX_BLOCK2
,
&
REAL_ELPA_KERNEL_AVX_BLOCK4
,
REAL_ELPA_KERNEL_AVX_BLOCK6
,
&
REAL_ELPA_KERNEL_AVX2_BLOCK2
,
&
REAL_ELPA_KERNEL_AVX2_BLOCK4
,
REAL_ELPA_KERNEL_AVX2_BLOCK6
public
::
COMPLEX_ELPA_KERNEL_GENERIC
,
COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
,
&
public
::
COMPLEX_ELPA_KERNEL_GENERIC
,
COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
,
&
COMPLEX_ELPA_KERNEL_BGP
,
COMPLEX_ELPA_KERNEL_BGQ
,
&
COMPLEX_ELPA_KERNEL_BGP
,
COMPLEX_ELPA_KERNEL_BGQ
,
&
COMPLEX_ELPA_KERNEL_SSE
,
COMPLEX_ELPA_KERNEL_AVX_BLOCK1
,
&
COMPLEX_ELPA_KERNEL_SSE
,
COMPLEX_ELPA_KERNEL_SSE_BLOCK1
,
&
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
COMPLEX_ELPA_KERNEL_SSE_BLOCK2
,
&
COMPLEX_ELPA_KERNEL_AVX_BLOCK1
,
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
,
&
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1
,
COMPLEX_ELPA_KERNEL_AVX2_BLOCK2
public
::
REAL_ELPA_KERNEL_NAMES
,
COMPLEX_ELPA_KERNEL_NAMES
public
::
REAL_ELPA_KERNEL_NAMES
,
COMPLEX_ELPA_KERNEL_NAMES
...
@@ -97,9 +103,15 @@ module ELPA2_utilities
...
@@ -97,9 +103,15 @@ module ELPA2_utilities
integer
,
parameter
::
REAL_ELPA_KERNEL_BGP
=
ELPA2_REAL_KERNEL_BGP
integer
,
parameter
::
REAL_ELPA_KERNEL_BGP
=
ELPA2_REAL_KERNEL_BGP
integer
,
parameter
::
REAL_ELPA_KERNEL_BGQ
=
ELPA2_REAL_KERNEL_BGQ
integer
,
parameter
::
REAL_ELPA_KERNEL_BGQ
=
ELPA2_REAL_KERNEL_BGQ
integer
,
parameter
::
REAL_ELPA_KERNEL_SSE
=
ELPA2_REAL_KERNEL_SSE
integer
,
parameter
::
REAL_ELPA_KERNEL_SSE
=
ELPA2_REAL_KERNEL_SSE
integer
,
parameter
::
REAL_ELPA_KERNEL_SSE_BLOCK2
=
ELPA2_REAL_KERNEL_SSE_BLOCK2
integer
,
parameter
::
REAL_ELPA_KERNEL_SSE_BLOCK4
=
ELPA2_REAL_KERNEL_SSE_BLOCK4
integer
,
parameter
::
REAL_ELPA_KERNEL_SSE_BLOCK6
=
ELPA2_REAL_KERNEL_SSE_BLOCK6
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX_BLOCK2
=
ELPA2_REAL_KERNEL_AVX_BLOCK2
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX_BLOCK2
=
ELPA2_REAL_KERNEL_AVX_BLOCK2
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX_BLOCK4
=
ELPA2_REAL_KERNEL_AVX_BLOCK4
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX_BLOCK4
=
ELPA2_REAL_KERNEL_AVX_BLOCK4
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX_BLOCK6
=
ELPA2_REAL_KERNEL_AVX_BLOCK6
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX_BLOCK6
=
ELPA2_REAL_KERNEL_AVX_BLOCK6
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX2_BLOCK2
=
ELPA2_REAL_KERNEL_AVX2_BLOCK2
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX2_BLOCK4
=
ELPA2_REAL_KERNEL_AVX2_BLOCK4
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX2_BLOCK6
=
ELPA2_REAL_KERNEL_AVX2_BLOCK6