Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
6e86364f
Unverified
Commit
6e86364f
authored
Apr 08, 2016
by
Lorenz Hüdepohl
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' of git@gitlab.mpcdf.mpg.de:elpa/elpa.git
parents
59e405e0
ebc097eb
Changes
18
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
6380 additions
and
49 deletions
+6380
-49
Makefile.am
Makefile.am
+20
-5
configure.ac
configure.ac
+103
-9
elpa/elpa_kernel_constants.h
elpa/elpa_kernel_constants.h
+18
-7
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
+5
-0
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp
+5
-0
src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp
src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp
+588
-0
src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp
src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp
+1465
-0
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c
+6
-0
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c
+5
-0
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
+6
-0
src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c
src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c
+849
-0
src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
+1302
-0
src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
+1729
-0
src/elpa2_utilities.F90
src/elpa2_utilities.F90
+92
-11
src/mod_compute_hh_trafo_complex.F90
src/mod_compute_hh_trafo_complex.F90
+54
-5
src/mod_compute_hh_trafo_real.F90
src/mod_compute_hh_trafo_real.F90
+126
-6
test/fortran_test_programs/elpa_test_programs_print_headers.X90
...ortran_test_programs/elpa_test_programs_print_headers.X90
+6
-5
test/shared_sources/read_input_parameters.F90
test/shared_sources/read_input_parameters.F90
+1
-1
No files found.
Makefile.am
View file @
6e86364f
...
...
@@ -80,31 +80,46 @@ if WITH_COMPLEX_SSE_KERNEL
endif
endif
if
WITH_REAL_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c
endif
if
WITH_REAL_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c
endif
if
WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
endif
if
WITH_REAL_AVX_BLOCK4_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c
endif
if
WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
endif
if
WITH_REAL_AVX_BLOCK6_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
endif
if
WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp
endif
if
WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
endif
if
WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp
endif
if
WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp
endif
#if WITH_AVX_SANDYBRIDGE
# libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c \
# src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
#endif
# install any .mod files in the include/ dir
elpa_includedir
=
$(includedir)
/elpa@SUFFIX@-@PACKAGE_VERSION@
nobase_elpa_include_HEADERS
=
$(
wildcard
modules/
*
)
...
...
configure.ac
View file @
6e86364f
...
...
@@ -143,6 +143,10 @@ install_real_generic_simple=yes
install_complex_generic=yes
install_complex_generic_simple=yes
#want_avx=yes
#want_avx2=yes
#want_sse=yes
AC_LANG([C])
dnl build with ftimings support
...
...
@@ -196,12 +200,26 @@ $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/nul
if test "$?" == 0; then
can_compile_sse=yes
install_real_sse=yes
install_real_sse_block2=yes
install_real_sse_block4=yes
install_real_sse_block6=yes
install_complex_sse=yes
install_complex_sse_block1=yes
install_complex_sse_block2=yes
else
can_compile_sse=no
install_real_sse=no
install_real_sse_block2=no
install_real_sse_block4=no
install_real_sse_block6=no
install_complex_sse=no
install_complex_sse_block1=no
install_complex_sse_block2=no
fi
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse}])
...
...
@@ -286,8 +304,6 @@ if test "${can_compile_avx}" = "yes" ; then
install_complex_avx_block1=yes
install_complex_avx_block2=yes
want_avx=yes
else
install_real_avx_block2=no
install_real_avx_block4=no
...
...
@@ -295,10 +311,23 @@ else
install_complex_avx_block1=no
install_complex_avx_block2=no
want_avx=yes
fi
if test "${can_compile_avx2}" = "yes" ; then
install_real_avx2_block2=yes
install_real_avx2_block4=yes
install_real_avx2_block6=yes
install_complex_avx2_block1=yes
install_complex_avx2_block2=yes
else
install_real_avx2_block2=no
install_real_avx2_block4=no
install_real_avx2_block6=no
install_complex_avx2_block1=no
install_complex_avx2_block2=no
fi
AM_CONDITIONAL([HAVE_SSE],[test x"$can_compile_sse" = x"yes"])
if test x"${can_compile_sse}" = x"yes" ; then
AC_DEFINE([HAVE_SSE],[1],[SSE is supported on this CPU])
...
...
@@ -477,7 +506,6 @@ AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environm
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
program test_get_environment
character(len=256) :: homedir
call get_environment_variable("HOME",homedir)
end program
...
...
@@ -570,6 +598,15 @@ dnl real kernels
dnl bgq kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq])
dnl real-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block2-kernel-only],[real-sse-block2-kernel],[install_real_sse_block2])
dnl real-sse-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block4-kernel]-only,[real-sse-block4-kernel],[install_real_sse_block4])
dnl real-sse-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block6-kernel-only],[real-sse-block6-kernel],[install_real_sse_block6])
dnl real-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2])
...
...
@@ -600,6 +637,12 @@ dnl complex kernels
dnl complex-bgq kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq])
dnl complex-sse-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block1-kernel-only],[complex-sse-block1-kernel],[install_complex_sse_block1])
dnl complex-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block2-kernel-only],[complex-sse-block2-kernel],[install_complex_sse_block2])
dnl complex-avx-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1])
...
...
@@ -607,6 +650,7 @@ dnl complex kernels
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2])
dnl set the conditionals according to the previous tests
if test x"${can_use_iso_fortran_env}" = x"yes" ; then
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi
...
...
@@ -641,6 +685,21 @@ if test x"${install_complex_sse}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_KERNEL],[1],[can use complex SSE kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK2_KERNEL],[test x"$install_real_sse_block2" = x"yes"])
if test x"${install_real_sse_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK2_KERNEL],[1],[can use real_sse_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK4_KERNEL],[test x"$install_real_sse_block4" = x"yes"])
if test x"${install_real_sse_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK4_KERNEL],[1],[can use real_sse_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK6_KERNEL],[test x"$install_real_sse_block6" = x"yes"])
if test x"${install_real_sse_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_SSE_BLOCK6_KERNEL],[1],[can use real_sse_block6 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"])
if test x"${install_real_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel])
...
...
@@ -656,6 +715,31 @@ if test x"${install_real_avx_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK2_KERNEL],[test x"$install_real_avx2_block2" = x"yes"])
if test x"${install_real_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK2_KERNEL],[1],[can use real_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK4_KERNEL],[test x"$install_real_avx2_block4" = x"yes"])
if test x"${install_real_avx2_block4}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK4_KERNEL],[1],[can use real_avx2_block4 kernel])
fi
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK6_KERNEL],[test x"$install_real_avx2_block6" = x"yes"])
if test x"${install_real_avx2_block6}" = x"yes" ; then
AC_DEFINE([WITH_REAL_AVX2_BLOCK6_KERNEL],[1],[can use real_avx2_block6 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"])
if test x"${install_complex_sse_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[test x"$install_complex_sse_block2" = x"yes"])
if test x"${install_complex_sse_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[1],[can use complex_sse_block2 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"])
if test x"${install_complex_avx_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel])
...
...
@@ -666,6 +750,16 @@ if test x"${install_complex_avx_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[test x"$install_complex_avx2_block1" = x"yes"])
if test x"${install_complex_avx2_block1}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[1],[can use complex_avx2_block1 kernel])
fi
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[test x"$install_complex_avx2_block2" = x"yes"])
if test x"${install_complex_avx2_block2}" = x"yes" ; then
AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[1],[can use complex_avx2_block2 kernel])
fi
AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"])
if test x"${install_real_bgp}" = x"yes" ; then
AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel])
...
...
@@ -763,13 +857,13 @@ mkdir -p test/shared_sources
grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1
if test "${can_compile_avx}" = "no" ; then
if test x"${want_avx}" = x"yes" ; then
#
if test x"${want_avx}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX instructions])
fi
#
fi
fi
if test "${can_compile_avx2}" = "no" ; then
if test x"${want_avx
}" = x"yes" ; then
# if test x"${want_avx2
}" = x"yes" ; then
AC_MSG_WARN([Could not compile AVX2 instructions])
fi
#
fi
fi
elpa/elpa_kernel_constants.h
View file @
6e86364f
...
...
@@ -3,11 +3,17 @@
#define ELPA2_REAL_KERNEL_BGP 3
#define ELPA2_REAL_KERNEL_BGQ 4
#define ELPA2_REAL_KERNEL_SSE 5
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8
#define ELPA2_REAL_KERNEL_SSE_BLOCK2 6
#define ELPA2_REAL_KERNEL_SSE_BLOCK4 7
#define ELPA2_REAL_KERNEL_SSE_BLOCK6 8
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 9
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 10
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 11
#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12
#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13
#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14
#define ELPA2_NUMBER_OF_REAL_KERNELS
8
#define ELPA2_NUMBER_OF_REAL_KERNELS
14
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
...
...
@@ -15,7 +21,12 @@
#define ELPA2_COMPLEX_KERNEL_BGP 3
#define ELPA2_COMPLEX_KERNEL_BGQ 4
#define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 8
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 7
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 11
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp
View file @
6e86364f
...
...
@@ -59,12 +59,15 @@
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <complex>
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline))
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
...
...
@@ -77,6 +80,8 @@
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif
#endif
extern
"C"
{
//Forward declaration
...
...
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp
View file @
6e86364f
...
...
@@ -59,12 +59,15 @@
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <complex>
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline))
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
...
...
@@ -77,6 +80,8 @@
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif
#endif
extern
"C"
{
//Forward declaration
...
...
src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp
0 → 100644
View file @
6e86364f
This diff is collapsed.
Click to expand it.
src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp
0 → 100644
View file @
6e86364f
This diff is collapsed.
Click to expand it.
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c
View file @
6e86364f
...
...
@@ -60,10 +60,14 @@
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
...
...
@@ -74,6 +78,8 @@
#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
#endif
#endif
//Forward declaration
__forceinline
void
hh_trafo_kernel_4_AVX_2hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
);
__forceinline
void
hh_trafo_kernel_8_AVX_2hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s
);
...
...
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c
View file @
6e86364f
...
...
@@ -59,11 +59,14 @@
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
...
...
@@ -78,6 +81,8 @@
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#endif
#endif
//Forward declaration
__forceinline
void
hh_trafo_kernel_4_AVX_4hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s_1_2
,
double
s_1_3
,
double
s_2_3
,
double
s_1_4
,
double
s_2_4
,
double
s_3_4
);
__forceinline
void
hh_trafo_kernel_8_AVX_4hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
s_1_2
,
double
s_1_3
,
double
s_2_3
,
double
s_1_4
,
double
s_2_4
,
double
s_3_4
);
...
...
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c
View file @
6e86364f
...
...
@@ -60,10 +60,14 @@
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) static
#ifdef HAVE_AVX2
#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
...
...
@@ -78,6 +82,8 @@
#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c)
#endif
#endif
//Forward declaration
static
void
hh_trafo_kernel_4_AVX_6hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
);
static
void
hh_trafo_kernel_8_AVX_6hv
(
double
*
q
,
double
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
*
scalarprods
);
...
...
src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c
0 → 100644
View file @
6e86364f
This diff is collapsed.
Click to expand it.
src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c
0 → 100644
View file @
6e86364f
This diff is collapsed.
Click to expand it.
src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c
0 → 100644
View file @
6e86364f
This diff is collapsed.
Click to expand it.
src/elpa2_utilities.F90
View file @
6e86364f
...
...
@@ -71,13 +71,19 @@ module ELPA2_utilities
public
::
get_actual_real_kernel_name
,
get_actual_complex_kernel_name
public
::
REAL_ELPA_KERNEL_GENERIC
,
REAL_ELPA_KERNEL_GENERIC_SIMPLE
,
&
REAL_ELPA_KERNEL_BGP
,
REAL_ELPA_KERNEL_BGQ
,
&
REAL_ELPA_KERNEL_SSE
,
REAL_ELPA_KERNEL_AVX_BLOCK2
,
&
REAL_ELPA_KERNEL_AVX_BLOCK4
,
REAL_ELPA_KERNEL_AVX_BLOCK6
REAL_ELPA_KERNEL_SSE
,
REAL_ELPA_KERNEL_SSE_BLOCK2
,
&
REAL_ELPA_KERNEL_SSE_BLOCK4
,
REAL_ELPA_KERNEL_SSE_BLOCK6
,
&
REAL_ELPA_KERNEL_AVX_BLOCK2
,
&
REAL_ELPA_KERNEL_AVX_BLOCK4
,
REAL_ELPA_KERNEL_AVX_BLOCK6
,
&
REAL_ELPA_KERNEL_AVX2_BLOCK2
,
&
REAL_ELPA_KERNEL_AVX2_BLOCK4
,
REAL_ELPA_KERNEL_AVX2_BLOCK6
public
::
COMPLEX_ELPA_KERNEL_GENERIC
,
COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
,
&
COMPLEX_ELPA_KERNEL_BGP
,
COMPLEX_ELPA_KERNEL_BGQ
,
&
COMPLEX_ELPA_KERNEL_SSE
,
COMPLEX_ELPA_KERNEL_AVX_BLOCK1
,
&
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
COMPLEX_ELPA_KERNEL_SSE
,
COMPLEX_ELPA_KERNEL_SSE_BLOCK1
,
&
COMPLEX_ELPA_KERNEL_SSE_BLOCK2
,
&
COMPLEX_ELPA_KERNEL_AVX_BLOCK1
,
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
,
&
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1
,
COMPLEX_ELPA_KERNEL_AVX2_BLOCK2
public
::
REAL_ELPA_KERNEL_NAMES
,
COMPLEX_ELPA_KERNEL_NAMES
...
...
@@ -97,9 +103,15 @@ module ELPA2_utilities
integer
,
parameter
::
REAL_ELPA_KERNEL_BGP
=
ELPA2_REAL_KERNEL_BGP
integer
,
parameter
::
REAL_ELPA_KERNEL_BGQ
=
ELPA2_REAL_KERNEL_BGQ
integer
,
parameter
::
REAL_ELPA_KERNEL_SSE
=
ELPA2_REAL_KERNEL_SSE
integer
,
parameter
::
REAL_ELPA_KERNEL_SSE_BLOCK2
=
ELPA2_REAL_KERNEL_SSE_BLOCK2
integer
,
parameter
::
REAL_ELPA_KERNEL_SSE_BLOCK4
=
ELPA2_REAL_KERNEL_SSE_BLOCK4
integer
,
parameter
::
REAL_ELPA_KERNEL_SSE_BLOCK6
=
ELPA2_REAL_KERNEL_SSE_BLOCK6
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX_BLOCK2
=
ELPA2_REAL_KERNEL_AVX_BLOCK2
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX_BLOCK4
=
ELPA2_REAL_KERNEL_AVX_BLOCK4
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX_BLOCK6
=
ELPA2_REAL_KERNEL_AVX_BLOCK6
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX2_BLOCK2
=
ELPA2_REAL_KERNEL_AVX2_BLOCK2
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX2_BLOCK4
=
ELPA2_REAL_KERNEL_AVX2_BLOCK4
integer
,
parameter
::
REAL_ELPA_KERNEL_AVX2_BLOCK6
=
ELPA2_REAL_KERNEL_AVX2_BLOCK6
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
integer
,
parameter
::
DEFAULT_REAL_ELPA_KERNEL
=
REAL_ELPA_KERNEL_GENERIC
...
...
@@ -112,9 +124,15 @@ module ELPA2_utilities
"REAL_ELPA_KERNEL_BGP "
,
&
"REAL_ELPA_KERNEL_BGQ "
,
&
"REAL_ELPA_KERNEL_SSE "
,
&
"REAL_ELPA_KERNEL_SSE_BLOCK2 "
,
&
"REAL_ELPA_KERNEL_SSE_BLOCK4 "
,
&
"REAL_ELPA_KERNEL_SSE_BLOCK6 "
,
&
"REAL_ELPA_KERNEL_AVX_BLOCK2 "
,
&
"REAL_ELPA_KERNEL_AVX_BLOCK4 "
,
&
"REAL_ELPA_KERNEL_AVX_BLOCK6 "
/)
"REAL_ELPA_KERNEL_AVX_BLOCK6 "
,
&
"REAL_ELPA_KERNEL_AVX2_BLOCK2 "
,
&
"REAL_ELPA_KERNEL_AVX2_BLOCK4 "
,
&
"REAL_ELPA_KERNEL_AVX2_BLOCK6 "
/)
integer
,
parameter
::
number_of_complex_kernels
=
ELPA2_NUMBER_OF_COMPLEX_KERNELS
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_GENERIC
=
ELPA2_COMPLEX_KERNEL_GENERIC
...
...
@@ -122,8 +140,12 @@ module ELPA2_utilities
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_BGP
=
ELPA2_COMPLEX_KERNEL_BGP
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_BGQ
=
ELPA2_COMPLEX_KERNEL_BGQ
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_SSE
=
ELPA2_COMPLEX_KERNEL_SSE
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_SSE_BLOCK1
=
ELPA2_COMPLEX_KERNEL_SSE_BLOCK1
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_SSE_BLOCK2
=
ELPA2_COMPLEX_KERNEL_SSE_BLOCK2
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_AVX_BLOCK1
=
ELPA2_COMPLEX_KERNEL_AVX_BLOCK1
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
=
ELPA2_COMPLEX_KERNEL_AVX_BLOCK2
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_AVX2_BLOCK1
=
ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1
integer
,
parameter
::
COMPLEX_ELPA_KERNEL_AVX2_BLOCK2
=
ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
integer
,
parameter
::
DEFAULT_COMPLEX_ELPA_KERNEL
=
COMPLEX_ELPA_KERNEL_GENERIC
...
...
@@ -136,8 +158,12 @@ module ELPA2_utilities
"COMPLEX_ELPA_KERNEL_BGP "
,
&
"COMPLEX_ELPA_KERNEL_BGQ "
,
&
"COMPLEX_ELPA_KERNEL_SSE "
,
&
"COMPLEX_ELPA_KERNEL_SSE_BLOCK1 "
,
&
"COMPLEX_ELPA_KERNEL_SSE_BLOCK2 "
,
&
"COMPLEX_ELPA_KERNEL_AVX_BLOCK1 "
,
&
"COMPLEX_ELPA_KERNEL_AVX_BLOCK2 "
/)
"COMPLEX_ELPA_KERNEL_AVX_BLOCK2 "
,
&
"COMPLEX_ELPA_KERNEL_AVX2_BLOCK1 "
,
&
"COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 "
/)
integer
,
parameter
::
&
AVAILABLE_REAL_ELPA_KERNELS
(
number_of_real_kernels
)
=
&
...
...
@@ -167,21 +193,54 @@ module ELPA2_utilities
#else
,
0
&
#endif
#if WITH_REAL_
AVX
_BLOCK2_KERNEL
#if WITH_REAL_
SSE
_BLOCK2_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_REAL_
AVX
_BLOCK4_KERNEL
#if WITH_REAL_
SSE
_BLOCK4_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_REAL_
AVX
_BLOCK6_KERNEL
#if WITH_REAL_
SSE
_BLOCK6_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_REAL_AVX_BLOCK2_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_REAL_AVX_BLOCK4_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_REAL_AVX_BLOCK6_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_REAL_AVX2_BLOCK2_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_REAL_AVX2_BLOCK4_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_REAL_AVX2_BLOCK6_KERNEL
,
1
&
#else
,
0
&
#endif
/)
integer
,
parameter
::
&
...
...
@@ -212,16 +271,38 @@ module ELPA2_utilities
#else
,
0
&
#endif
#if WITH_COMPLEX_
AVX
_BLOCK1_KERNEL
#if WITH_COMPLEX_
SSE
_BLOCK1_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_COMPLEX_
AVX
_BLOCK2_KERNEL
#if WITH_COMPLEX_
SSE
_BLOCK2_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_COMPLEX_AVX_BLOCK1_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_COMPLEX_AVX_BLOCK2_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_COMPLEX_AVX2_BLOCK1_KERNEL
,
1
&
#else
,
0
&
#endif
#if WITH_COMPLEX_AVX2_BLOCK2_KERNEL
,
1
&
#else
,
0
&
#endif
/)
!******
...
...
src/mod_compute_hh_trafo_complex.F90
View file @
6e86364f
...
...
@@ -90,9 +90,38 @@ module compute_hh_trafo_complex
nl
=
merge
(
stripe_width
,
last_stripe_width
,
istripe
<
stripe_count
)
#endif
#if defined(WITH_COMPLEX_
AVX
_BLOCK2_KERNEL)
#if defined(WITH_COMPLEX_
SSE
_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if
(
THIS_COMPLEX_ELPA_KERNEL
.eq.
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
)
then
if
(
THIS_COMPLEX_ELPA_KERNEL
.eq.
COMPLEX_ELPA_KERNEL_SSE_BLOCK2
)
then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
ttt
=
mpi_wtime
()
do
j
=
ncols
,
2
,
-2
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_complex_sse_2hv
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
),
&
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#else
call
double_hh_trafo_complex_sse_2hv
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
),
&
w
,
nbw
,
nl
,
stripe_width
,
nbw
)
#endif
enddo
#ifdef WITH_OPENMP
if
(
j
==
1
)
call
single_hh_trafo_complex_sse_1hv
(
a
(
1
,
1
+
off
+
a_off
,
istripe
,
my_thread
),
&
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#else
if
(
j
==
1
)
call
single_hh_trafo_complex_sse_1hv
(
a
(
1
,
1
+
off
+
a_off
,
istripe
),
&
bcast_buffer
(
1
,
off
+1
),
nbw
,
nl
,
stripe_width
)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if
(
(
THIS_COMPLEX_ELPA_KERNEL
.eq.
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
)
.or.
&
(
THIS_COMPLEX_ELPA_KERNEL
.eq.
COMPLEX_ELPA_KERNEL_AVX2_BLOCK2
)
)
then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
ttt
=
mpi_wtime
()
do
j
=
ncols
,
2
,
-2
...
...
@@ -213,9 +242,29 @@ module compute_hh_trafo_complex
! call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
if
(
THIS_COMPLEX_ELPA_KERNEL
.eq.
COMPLEX_ELPA_KERNEL_SSE_BLOCK1
)
then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
ttt
=
mpi_wtime
()