Unverified Commit 6e86364f authored by Lorenz Hüdepohl's avatar Lorenz Hüdepohl
Browse files

Merge branch 'master' of git@gitlab.mpcdf.mpg.de:elpa/elpa.git

parents 59e405e0 ebc097eb
...@@ -80,31 +80,46 @@ if WITH_COMPLEX_SSE_KERNEL ...@@ -80,31 +80,46 @@ if WITH_COMPLEX_SSE_KERNEL
endif endif
endif endif
if WITH_REAL_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c
endif
if WITH_REAL_AVX_BLOCK2_KERNEL if WITH_REAL_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c
endif endif
if WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
endif
if WITH_REAL_AVX_BLOCK4_KERNEL if WITH_REAL_AVX_BLOCK4_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c
endif endif
if WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
endif
if WITH_REAL_AVX_BLOCK6_KERNEL if WITH_REAL_AVX_BLOCK6_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
endif endif
if WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp
endif
if WITH_COMPLEX_AVX_BLOCK1_KERNEL if WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
endif endif
if WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp
endif
if WITH_COMPLEX_AVX_BLOCK2_KERNEL if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp
endif endif
#if WITH_AVX_SANDYBRIDGE
# libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c \
# src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
#endif
# install any .mod files in the include/ dir # install any .mod files in the include/ dir
elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@ elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@
nobase_elpa_include_HEADERS = $(wildcard modules/*) nobase_elpa_include_HEADERS = $(wildcard modules/*)
......
...@@ -143,6 +143,10 @@ install_real_generic_simple=yes ...@@ -143,6 +143,10 @@ install_real_generic_simple=yes
install_complex_generic=yes install_complex_generic=yes
install_complex_generic_simple=yes install_complex_generic_simple=yes
#want_avx=yes
#want_avx2=yes
#want_sse=yes
AC_LANG([C]) AC_LANG([C])
dnl build with ftimings support dnl build with ftimings support
...@@ -196,12 +200,26 @@ $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/nul ...@@ -196,12 +200,26 @@ $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/nul
if test "$?" == 0; then if test "$?" == 0; then
can_compile_sse=yes can_compile_sse=yes
install_real_sse=yes install_real_sse=yes
install_real_sse_block2=yes
install_real_sse_block4=yes
install_real_sse_block6=yes
install_complex_sse=yes install_complex_sse=yes
install_complex_sse_block1=yes
install_complex_sse_block2=yes
else else
can_compile_sse=no can_compile_sse=no
install_real_sse=no install_real_sse=no
install_real_sse_block2=no
install_real_sse_block4=no
install_real_sse_block6=no
install_complex_sse=no install_complex_sse=no
install_complex_sse_block1=no
install_complex_sse_block2=no
fi fi
rm -f ./test.o rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}]) AC_MSG_RESULT([${can_compile_sse}])
...@@ -286,8 +304,6 @@ if test "${can_compile_avx}" = "yes" ; then ...@@ -286,8 +304,6 @@ if test "${can_compile_avx}" = "yes" ; then
install_complex_avx_block1=yes install_complex_avx_block1=yes
install_complex_avx_block2=yes install_complex_avx_block2=yes
want_avx=yes
else else
install_real_avx_block2=no install_real_avx_block2=no
install_real_avx_block4=no install_real_avx_block4=no
...@@ -295,10 +311,23 @@ else ...@@ -295,10 +311,23 @@ else
install_complex_avx_block1=no install_complex_avx_block1=no
install_complex_avx_block2=no install_complex_avx_block2=no
want_avx=yes
fi fi
if test "${can_compile_avx2}" = "yes" ; then
install_real_avx2_block2=yes
install_real_avx2_block4=yes
install_real_avx2_block6=yes
install_complex_avx2_block1=yes
install_complex_avx2_block2=yes
else
install_real_avx2_block2=no
install_real_avx2_block4=no
install_real_avx2_block6=no
install_complex_avx2_block1=no
install_complex_avx2_block2=no
fi
AM_CONDITIONAL([HAVE_SSE],[test x"$can_compile_sse" = x"yes"]) AM_CONDITIONAL([HAVE_SSE],[test x"$can_compile_sse" = x"yes"])
if test x"${can_compile_sse}" = x"yes" ; then if test x"${can_compile_sse}" = x"yes" ; then
AC_DEFINE([HAVE_SSE],[1],[SSE is supported on this CPU]) AC_DEFINE([HAVE_SSE],[1],[SSE is supported on this CPU])
...@@ -477,7 +506,6 @@ AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environm ...@@ -477,7 +506,6 @@ AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environm
AC_COMPILE_IFELSE([AC_LANG_SOURCE([ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
program test_get_environment program test_get_environment
character(len=256) :: homedir character(len=256) :: homedir
call get_environment_variable("HOME",homedir) call get_environment_variable("HOME",homedir)
end program end program
...@@ -570,6 +598,15 @@ dnl real kernels ...@@ -570,6 +598,15 @@ dnl real kernels
dnl bgq kernel dnl bgq kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq])
dnl real-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block2-kernel-only],[real-sse-block2-kernel],[install_real_sse_block2])
dnl real-sse-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block4-kernel]-only,[real-sse-block4-kernel],[install_real_sse_block4])
dnl real-sse-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block6-kernel-only],[real-sse-block6-kernel],[install_real_sse_block6])
dnl real-avx-block2 kernel dnl real-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2])
...@@ -600,6 +637,12 @@ dnl complex kernels ...@@ -600,6 +637,12 @@ dnl complex kernels
dnl complex-bqq kernel dnl complex-bqq kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq])
dnl complex-sse-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block1-kernel-only],[complex-sse-block1-kernel],[install_complex_sse_block1])
dnl complex-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block2-kernel-only],[complex-sse-block2-kernel],[install_complex_sse_block2])
dnl complex-avx-block1 kernel dnl complex-avx-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1])
...@@ -607,6 +650,7 @@ dnl complex kernels ...@@ -607,6 +650,7 @@ dnl complex kernels
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2])
dnl set the conditionals according to the previous tests dnl set the conditionals according to the previous tests
if test x"${can_use_iso_fortran_env}" = x"yes" ; then if test x"${can_use_iso_fortran_env}" = x"yes" ; then
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env]) AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi fi
...@@ -641,6 +685,21 @@ if test x"${install_complex_sse}" = x"yes" ; then ...@@ -641,6 +685,21 @@ if test x"${install_complex_sse}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel]) AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel])
fi fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK2_KERNEL],[test x"$install_real_sse_block2" = x"yes"])
if test x"${install_real_sse_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK2_KERNEL],[1],[can use real_sse_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK4_KERNEL],[test x"$install_real_sse_block4" = x"yes"])
if test x"${install_real_sse_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK4_KERNEL],[1],[can use real_sse_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK6_KERNEL],[test x"$install_real_sse_block6" = x"yes"])
if test x"${install_real_sse_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK6_KERNEL],[1],[can use real_sse_block6 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"]) AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"])
if test x"${install_real_avx_block2}" = x"yes" ; then if test x"${install_real_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel]) AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel])
...@@ -656,6 +715,31 @@ if test x"${install_real_avx_block6}" = x"yes" ; then ...@@ -656,6 +715,31 @@ if test x"${install_real_avx_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel]) AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel])
fi fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK2_KERNEL],[test x"$install_real_avx2_block2" = x"yes"])
if test x"${install_real_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK2_KERNEL],[1],[can use real_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK4_KERNEL],[test x"$install_real_avx2_block4" = x"yes"])
if test x"${install_real_avx2_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK4_KERNEL],[1],[can use real_avx2_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK6_KERNEL],[test x"$install_real_avx2_block6" = x"yes"])
if test x"${install_real_avx2_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK6_KERNEL],[1],[can use real_avx2_block6 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"])
if test x"${install_complex_sse_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[test x"$install_complex_sse_block2" = x"yes"])
if test x"${install_complex_sse_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[1],[can use complex_sse_block2 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"]) AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"])
if test x"${install_complex_avx_block1}" = x"yes" ; then if test x"${install_complex_avx_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel]) AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel])
...@@ -666,6 +750,16 @@ if test x"${install_complex_avx_block2}" = x"yes" ; then ...@@ -666,6 +750,16 @@ if test x"${install_complex_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel]) AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel])
fi fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[test x"$install_complex_avx2_block1" = x"yes"])
if test x"${install_complex_avx2_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[1],[can use complex_avx2_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[test x"$install_complex_avx2_block2" = x"yes"])
if test x"${install_complex_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[1],[can use complex_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"]) AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"])
if test x"${install_real_bgp}" = x"yes" ; then if test x"${install_real_bgp}" = x"yes" ; then
AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel]) AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel])
...@@ -763,13 +857,13 @@ mkdir -p test/shared_sources ...@@ -763,13 +857,13 @@ mkdir -p test/shared_sources
grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1 grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1
if test "${can_compile_avx}" = "no" ; then if test "${can_compile_avx}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then # if test x"${want_avx}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX instructions]) AC_MSG_WARN([Could not compile AVX instructions])
fi # fi
fi fi
if test "${can_compile_avx2}" = "no" ; then if test "${can_compile_avx2}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then # if test x"${want_avx2}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX2 instructions]) AC_MSG_WARN([Could not compile AVX2 instructions])
fi # fi
fi fi
...@@ -3,11 +3,17 @@ ...@@ -3,11 +3,17 @@
#define ELPA2_REAL_KERNEL_BGP 3 #define ELPA2_REAL_KERNEL_BGP 3
#define ELPA2_REAL_KERNEL_BGQ 4 #define ELPA2_REAL_KERNEL_BGQ 4
#define ELPA2_REAL_KERNEL_SSE 5 #define ELPA2_REAL_KERNEL_SSE 5
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6 #define ELPA2_REAL_KERNEL_SSE_BLOCK2 6
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7 #define ELPA2_REAL_KERNEL_SSE_BLOCK4 7
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8 #define ELPA2_REAL_KERNEL_SSE_BLOCK6 8
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 9
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 10
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 11
#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12
#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13
#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14
#define ELPA2_NUMBER_OF_REAL_KERNELS 8 #define ELPA2_NUMBER_OF_REAL_KERNELS 14
#define ELPA2_COMPLEX_KERNEL_GENERIC 1 #define ELPA2_COMPLEX_KERNEL_GENERIC 1
...@@ -15,7 +21,12 @@ ...@@ -15,7 +21,12 @@
#define ELPA2_COMPLEX_KERNEL_BGP 3 #define ELPA2_COMPLEX_KERNEL_BGP 3
#define ELPA2_COMPLEX_KERNEL_BGQ 4 #define ELPA2_COMPLEX_KERNEL_BGQ 4
#define ELPA2_COMPLEX_KERNEL_SSE 5 #define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6 #define ELPA2_COMPLEX_KERNEL_SSE_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7 #define ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 8
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 7
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 11
...@@ -59,12 +59,15 @@ ...@@ -59,12 +59,15 @@
// Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// -------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <complex> #include <complex>
#include <x86intrin.h> #include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
#ifdef HAVE_AVX2
#ifdef __FMA4__ #ifdef __FMA4__
#define __ELPA_USE_FMA__ #define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c) #define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
...@@ -77,6 +80,8 @@ ...@@ -77,6 +80,8 @@
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) #define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif #endif
#endif
extern "C" { extern "C" {
//Forward declaration //Forward declaration
......
...@@ -59,12 +59,15 @@ ...@@ -59,12 +59,15 @@
// Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// -------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <complex> #include <complex>
#include <x86intrin.h> #include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
#ifdef HAVE_AVX2
#ifdef __FMA4__ #ifdef __FMA4__
#define __ELPA_USE_FMA__ #define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c) #define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
...@@ -77,6 +80,8 @@ ...@@ -77,6 +80,8 @@
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) #define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif #endif
#endif
extern "C" { extern "C" {
//Forward declaration //Forward declaration
......
This diff is collapsed.
This diff is collapsed.
...@@ -60,10 +60,14 @@ ...@@ -60,10 +60,14 @@
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// -------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <x86intrin.h> #include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_AVX2
#ifdef __FMA4__ #ifdef __FMA4__
#define __ELPA_USE_FMA__ #define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
...@@ -74,6 +78,8 @@ ...@@ -74,6 +78,8 @@
#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) #define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
#endif #endif
#endif
//Forward declaration //Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); __forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); __forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
......
...@@ -59,11 +59,14 @@ ...@@ -59,11 +59,14 @@
// Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// -------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <x86intrin.h> #include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_AVX2
#ifdef __FMA4__ #ifdef __FMA4__
#define __ELPA_USE_FMA__ #define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
...@@ -78,6 +81,8 @@ ...@@ -78,6 +81,8 @@
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c) #define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#endif #endif
#endif
//Forward declaration //Forward declaration
__forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); __forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); __forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
......
...@@ -60,10 +60,14 @@ ...@@ -60,10 +60,14 @@
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// -------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <x86intrin.h> #include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static #define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_AVX2
#ifdef __FMA4__ #ifdef __FMA4__
#define __ELPA_USE_FMA__ #define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
...@@ -78,6 +82,8 @@ ...@@ -78,6 +82,8 @@
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c) #define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#endif #endif
#endif
//Forward declaration //Forward declaration
static void hh_trafo_kernel_4_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); static void hh_trafo_kernel_4_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_8_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); static void hh_trafo_kernel_8_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -71,13 +71,19 @@ module ELPA2_utilities ...@@ -71,13 +71,19 @@ module ELPA2_utilities
public :: get_actual_real_kernel_name, get_actual_complex_kernel_name public :: get_actual_real_kernel_name, get_actual_complex_kernel_name
public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, & public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, &
REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ, & REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ, &
REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_AVX_BLOCK2, & REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_SSE_BLOCK2, &
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6 REAL_ELPA_KERNEL_SSE_BLOCK4, REAL_ELPA_KERNEL_SSE_BLOCK6, &
REAL_ELPA_KERNEL_AVX_BLOCK2, &
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, &
REAL_ELPA_KERNEL_AVX2_BLOCK2, &
REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6
public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, & public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, & COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, &
COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_AVX_BLOCK1, & COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK2 COMPLEX_ELPA_KERNEL_SSE_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2, &
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2
public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES
...@@ -97,9 +103,15 @@ module ELPA2_utilities ...@@ -97,9 +103,15 @@ module ELPA2_utilities
integer, parameter :: REAL_ELPA_KERNEL_BGP = ELPA2_REAL_KERNEL_BGP integer, parameter :: REAL_ELPA_KERNEL_BGP = ELPA2_REAL_KERNEL_BGP
integer, parameter :: REAL_ELPA_KERNEL_BGQ = ELPA2_REAL_KERNEL_BGQ integer, parameter :: REAL_ELPA_KERNEL_BGQ = ELPA2_REAL_KERNEL_BGQ
integer, parameter :: REAL_ELPA_KERNEL_SSE = ELPA2_REAL_KERNEL_SSE integer, parameter :: REAL_ELPA_KERNEL_SSE = ELPA2_REAL_KERNEL_SSE
integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK2 = ELPA2_REAL_KERNEL_SSE_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK4 = ELPA2_REAL_KERNEL_SSE_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK6 = ELPA2_REAL_KERNEL_SSE_BLOCK6
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_REAL_KERNEL_AVX_BLOCK2 integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_REAL_KERNEL_AVX_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4 = ELPA2_REAL_KERNEL_AVX_BLOCK4 integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4 = ELPA2_REAL_KERNEL_AVX_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6 = ELPA2_REAL_KERNEL_AVX_BLOCK6 integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6 = ELPA2_REAL_KERNEL_AVX_BLOCK6
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_REAL_KERNEL_AVX2_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK4 = ELPA2_REAL_KERNEL_AVX2_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK6 = ELPA2_REAL_KERNEL_AVX2_BLOCK6