Commit e6767067 authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'master_pre_stage' into 'master'

Master pre stage

See merge request !52
parents 5bff9356 de9b8cfb
......@@ -2,13 +2,16 @@ Changelog for next release
- not yet decided
Changelog for ELPA 2020.11.001.rc1
Changelog for ELPA 2020.11.001
- this release contains mostly bugfixes:
- fix determination whether a _ is needed to link Fortran to C
- fix an error in the real block4 kernel for arch64 NEON
- add missing test_scalapack_template.F90 to EXTRA_DIST list
- fix error in the GPU kernel
- switch from python2 to python3
- experimental feature: complex kernels for arch64 NEON
- experimental feature: kernels for ARM SVE
Changelog for ELPA 2020.05.001
......
......@@ -2,7 +2,7 @@
## Preamble ##
This file provides documentation on how to build the *ELPA* library in **version ELPA-2020.11.001.rc1**.
This file provides documentation on how to build the *ELPA* library in **version ELPA-2020.11.001**.
With release of **version ELPA-2017.05.001** the build process has been significantly simplified,
which makes it easier to install the *ELPA* library.
......@@ -10,7 +10,7 @@ The release ELPA 2018.11.001 was the last release, where the legacy API has been
enabled by default (and can be disabled at build time).
With the release ELPA 2019.11.001, the legacy API has been deprecated and the support has been closed.
The release of ELPA 2020.11.001.rc1 does change the API and ABI compared to the release 2019.11.001, since
The release of ELPA 2020.11.001 does change the API and ABI compared to the release 2019.11.001, since
the legacy API has been dropped.
## How to install *ELPA* ##
......@@ -59,6 +59,9 @@ An excerpt of the most important (*ELPA* specific) options reads as follows:
| `--disable-avx` | do not build AVX kernels, default: enabled |
| `--disable-avx2` | do not build AVX2 kernels, default: enabled |
| `--enable-avx512` | build AVX512 kernels, default: disabled |
| `--enable-sve128` | Experimental feature: build ARM SVE128 kernels, default: disabled |
| `--enable-sve256` | Experimental feature: build ARM SVE256 kernels, default: disabled |
| `--enable-sve512` | Experimental feature: build ARM SVE512 kernels, default: disabled |
| `--enable-gpu` | build GPU kernels, default: disabled |
| `--enable-bgp` | build BGP kernels, default: disabled |
| `--enable-bgq` | build BGQ kernels, default: disabled |
......
......@@ -3,9 +3,11 @@
For more details and recent updates please visit the online [issue system](https://gitlab.mpcdf.mpg.de/elpa/elpa/issues)
Issues which are not mentioned in a newer release are (considered as) solved.
### ELPA 2020.11.001.rc1 release ###
### ELPA 2020.11.001 release ###
- fixes a problem with GPU kernels
- fixes a problem with VSX kernels
- fixes a problem with NEON kernels
- do not use MPI_COMM_WORLD in check_gpu function
- add missing test_scalapack_template.F90 to EXTRA_DIST list
### ELPA 2020.05.001 release ###
......
......@@ -247,6 +247,27 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
# ARM SVE kernels, real BLOCK2 case: one conditional per variant
# (SVE128, SVE256, SVE512). When a variant is enabled, its double-precision
# source is appended to the private library source list; the matching
# single-precision source is appended only if WANT_SINGLE_PRECISION_REAL
# is also set.
if WITH_REAL_SVE128_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_2hv_single_precision.c
endif
endif
if WITH_REAL_SVE256_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_2hv_single_precision.c
endif
endif
if WITH_REAL_SVE512_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_2hv_single_precision.c
endif
endif
if WITH_REAL_SPARC64_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
......@@ -296,6 +317,27 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
# ARM SVE kernels, real BLOCK4 case: one conditional per variant
# (SVE128, SVE256, SVE512). When a variant is enabled, its double-precision
# source is appended to the private library source list; the matching
# single-precision source is appended only if WANT_SINGLE_PRECISION_REAL
# is also set.
if WITH_REAL_SVE128_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_4hv_single_precision.c
endif
endif
if WITH_REAL_SVE256_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_4hv_single_precision.c
endif
endif
if WITH_REAL_SVE512_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_4hv_single_precision.c
endif
endif
if WITH_REAL_SPARC64_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
......@@ -345,6 +387,27 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
# ARM SVE kernels, real BLOCK6 case: one conditional per variant
# (SVE128, SVE256, SVE512). When a variant is enabled, its double-precision
# source is appended to the private library source list; the matching
# single-precision source is appended only if WANT_SINGLE_PRECISION_REAL
# is also set.
if WITH_REAL_SVE128_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve128_6hv_single_precision.c
endif
endif
if WITH_REAL_SVE256_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve256_6hv_single_precision.c
endif
endif
if WITH_REAL_SVE512_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sve512_6hv_single_precision.c
endif
endif
#if WITH_COMPLEX_SPARC64_BLOCK1_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
......@@ -366,6 +429,13 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
# Complex NEON arch64 kernel, BLOCK1 case. The double-precision source is
# appended to the private library source list when the kernel is enabled;
# the single-precision source is appended only if
# WANT_SINGLE_PRECISION_COMPLEX is also set.
if WITH_COMPLEX_NEON_ARCH64_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_neon_arch64_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_neon_arch64_1hv_single_precision.c
endif
endif
if WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
......@@ -380,7 +450,6 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
if WITH_COMPLEX_AVX512_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx512_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
......@@ -388,6 +457,27 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
# ARM SVE kernels, complex BLOCK1 case: one conditional per variant
# (SVE128, SVE256, SVE512). When a variant is enabled, its double-precision
# source is appended to the private library source list; the matching
# single-precision source is appended only if WANT_SINGLE_PRECISION_COMPLEX
# is also set.
if WITH_COMPLEX_SVE128_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve128_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve128_1hv_single_precision.c
endif
endif
if WITH_COMPLEX_SVE256_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve256_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve256_1hv_single_precision.c
endif
endif
if WITH_COMPLEX_SVE512_BLOCK1_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve512_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve512_1hv_single_precision.c
endif
endif
#if WITH_COMPLEX_SPARC64_BLOCK2_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_double_precision.c
#if WANT_SINGLE_PRECISION_COMPLEX
......@@ -409,6 +499,13 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
# Complex NEON arch64 kernel, BLOCK2 case. The double-precision source is
# appended to the private library source list when the kernel is enabled;
# the single-precision source is appended only if
# WANT_SINGLE_PRECISION_COMPLEX is also set.
if WITH_COMPLEX_NEON_ARCH64_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_neon_arch64_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_neon_arch64_2hv_single_precision.c
endif
endif
if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
......@@ -430,6 +527,27 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
# ARM SVE kernels, complex BLOCK2 case: one conditional per variant
# (SVE128, SVE256, SVE512). When a variant is enabled, its double-precision
# source is appended to the private library source list; the matching
# single-precision source is appended only if WANT_SINGLE_PRECISION_COMPLEX
# is also set.
if WITH_COMPLEX_SVE128_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve128_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve128_2hv_single_precision.c
endif
endif
if WITH_COMPLEX_SVE256_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve256_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve256_2hv_single_precision.c
endif
endif
if WITH_COMPLEX_SVE512_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve512_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sve512_2hv_single_precision.c
endif
endif
if STORE_BUILD_CONFIG
libelpa@SUFFIX@_private_la_SOURCES += src/helpers/print_build_config.c
......
......@@ -2,7 +2,7 @@
## Current Release ##
The current release is ELPA 2020.11.001.rc1 The current supported API version
The current release is ELPA 2020.11.001. The current supported API version
is 20190501. This release supports the earliest API version 20170403.
The release ELPA 2018.11.001 was the last release, where the legacy API has been
......@@ -130,7 +130,7 @@ the possible configure options.
## Using *ELPA*
Please have a look at the [USERS_GUIDE](USERS_GUIDE.md) file, to get a documentation or at the [online](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2020.11.001.rc1/html/index.html) doxygen documentation, where you find the definition of the interfaces.
Please have a look at the [USERS_GUIDE](USERS_GUIDE.md) file for documentation, or at the [online](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2020.11.001/html/index.html) doxygen documentation, where you find the definition of the interfaces.
## Contributing to *ELPA*
......
This file contains the release notes for the ELPA 2020.11.001.rc1 version
This file contains the release notes for the ELPA 2020.11.001 version
What is new?
-------------
For detailed information about changes since release ELPA 2020.05.001 please have a look at the Changelog file
- bugfixes
- experimental feature: support for ARM SVE
- experimental feature: complex kernels for arch64 NEON
ABI change
......
......@@ -146,7 +146,7 @@ Local documentation (via man pages) should be available (if *ELPA* has been inst
For example `man elpa2_print_kernels` should provide the documentation for the *ELPA* program which prints all
the available kernels.
Also a [online doxygen documentation](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2020.11.001.rc1/html/index.html)
Also an [online doxygen documentation](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2020.11.001/html/index.html)
for each *ELPA* release is available.
......@@ -13,7 +13,7 @@ Local documentation (via man pages) should be available (if *ELPA* has been inst
For example `man elpa2_print_kernels` should provide the documentation for the *ELPA* program, which prints all
the available kernels.
Also a [online doxygen documentation](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2020.11.001.rc1/html/index.html)
Also an [online doxygen documentation](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2020.11.001/html/index.html)
for each *ELPA* release is available.
......@@ -180,7 +180,7 @@ The following table gives a list of all supported parameters which can be used t
## III) List of computational routines ##
The following compute routines are available in *ELPA*: Please have a look at the man pages or [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2020.11.001.rc1/html/index.html) for details.
The following compute routines are available in *ELPA*. Please have a look at the man pages or the [online doxygen documentation](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2020.11.001/html/index.html) for details.
| Name | Purpose | since API version |
......
......@@ -22,7 +22,7 @@ The *ELPA* library consists of two main parts:
Both variants of the *ELPA* solvers are available for real or complex single and double precision valued matrices.
Thus *ELPA* provides the following user functions (see man pages or [online](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2020.11.001.rc1/html/index.html) for details):
Thus *ELPA* provides the following user functions (see man pages or [online](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2020.11.001/html/index.html) for details):
- elpa_get_communicators : set the row / column communicators for *ELPA*
- elpa_solve_evp_complex_1stage_{single|double} : solve a {single|double} precision complex eigenvalue problem with the *ELPA 1stage* solver
......
......@@ -46,7 +46,7 @@ AC_DEFINE([EARLIEST_AUTOTUNE_VERSION], [20171201], [Earliest ELPA API version, w
AC_DEFINE([CURRENT_AUTOTUNE_VERSION], [20200417], [Current ELPA autotune version])
AC_DEFINE_SUBST(CURRENT_AUTOTUNE_VERSION, 20200417, "Current ELPA autotune version")
AC_DEFINE_UNQUOTED([ELPA_BUILDTIME], [$ELPA_BUILDTIME], ["Time of build"])
AX_COMPARE_VERSION([$ELPA_BUILDTIME], [gt], [1605657599],[old_elpa_version=yes],[old_elpa_version=no])
AX_COMPARE_VERSION([$ELPA_BUILDTIME], [gt], [1623715200],[old_elpa_version=yes],[old_elpa_version=no])
AX_CHECK_GNU_MAKE()
if test x$_cv_gnu_make_command = x ; then
......@@ -793,6 +793,8 @@ m4_define(elpa_m4_neon_arch64_kernels, [
real_neon_arch64_block2
real_neon_arch64_block4
real_neon_arch64_block6
complex_neon_arch64_block1
complex_neon_arch64_block2
])
m4_define(elpa_m4_vsx_kernels, [
......@@ -801,6 +803,14 @@ m4_define(elpa_m4_vsx_kernels, [
real_vsx_block6
])
# Kernel names belonging to the sve128 kernel type: real block2/4/6 and
# complex block1/2. Keep the body a plain whitespace-separated word list with
# no comments inside; other parts of this file iterate over it word by word.
m4_define(elpa_m4_sve128_kernels, [
real_sve128_block2
real_sve128_block4
real_sve128_block6
complex_sve128_block1
complex_sve128_block2
])
m4_define(elpa_m4_avx_kernels, [
real_avx_block2
real_avx_block4
......@@ -817,6 +827,14 @@ m4_define(elpa_m4_avx2_kernels, [
complex_avx2_block2
])
# Kernel names belonging to the sve256 kernel type: real block2/4/6 and
# complex block1/2. Keep the body a plain whitespace-separated word list with
# no comments inside; other parts of this file iterate over it word by word.
m4_define(elpa_m4_sve256_kernels, [
real_sve256_block2
real_sve256_block4
real_sve256_block6
complex_sve256_block1
complex_sve256_block2
])
m4_define(elpa_m4_avx512_kernels, [
real_avx512_block2
real_avx512_block4
......@@ -825,6 +843,14 @@ m4_define(elpa_m4_avx512_kernels, [
complex_avx512_block2
])
# Kernel names belonging to the sve512 kernel type: real block2/4/6 and
# complex block1/2. Keep the body a plain whitespace-separated word list with
# no comments inside; other parts of this file iterate over it word by word.
m4_define(elpa_m4_sve512_kernels, [
real_sve512_block2
real_sve512_block4
real_sve512_block6
complex_sve512_block1
complex_sve512_block2
])
m4_define(elpa_m4_bgp_kernels, [
real_bgp
complex_bgp
......@@ -840,7 +866,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
......@@ -881,6 +907,9 @@ ELPA_SELECT_KERNELS([sse_assembly],[enable])
ELPA_SELECT_KERNELS([avx],[enable])
ELPA_SELECT_KERNELS([avx2],[enable])
ELPA_SELECT_KERNELS([avx512],[enable])
ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
......@@ -890,7 +919,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels, [
if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi
......@@ -950,7 +979,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
])
fi
])
m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512],[
m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512 sve128 sve256 sve512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
......@@ -1008,7 +1037,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
......@@ -1233,6 +1262,69 @@ if test x"${need_avx2}" = x"yes"; then
AC_DEFINE([HAVE_AVX2],[1],[AVX2 is supported on this CPU])
fi
# SVE128 probe, executed only when need_sve128 is yes.
# Compile-only check: the test program below is compiled, never linked or run.
# It verifies that the C compiler accepts ARM SVE intrinsics from arm_sve.h
# with the current flags. configure aborts with an error if compilation
# fails; otherwise HAVE_SVE128 is defined.
# NOTE(review): the test program is identical to the SVE256 and SVE512 probes,
# so it does not exercise a specific vector width, and the define description
# says supported on this CPU although only compiler support is checked.
if test x"${need_sve128}" = x"yes"; then
AC_MSG_CHECKING([whether we can compile SVE128 gcc intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <arm_sve.h>
int main(int argc, char **argv){
double *q;
svfloat64_t q1 = svld1_f64(svptrue_b64(), q);
svfloat64_t y1 = svmad_f64_z(svptrue_b64(), q1, q1, q1);
return 0;
}
])],
[can_compile_sve128=yes],
[can_compile_sve128=no]
)
AC_MSG_RESULT([${can_compile_sve128}])
if test x"$can_compile_sve128" != x"yes"; then
AC_MSG_ERROR([Could not compile a test program with SVE128, adjust the C compiler or CFLAGS. Possibly (some of) the flags " $SIMD_FLAGS " solve this issue])
fi
AC_DEFINE([HAVE_SVE128],[1],[SVE128 is supported on this CPU])
fi
# SVE256 probe, executed only when need_sve256 is yes.
# Compile-only check: the test program below is compiled, never linked or run.
# It verifies that the C compiler accepts ARM SVE intrinsics from arm_sve.h
# with the current flags. configure aborts with an error if compilation
# fails; otherwise HAVE_SVE256 is defined.
# NOTE(review): the test program is identical to the SVE128 and SVE512 probes,
# so it does not exercise a specific vector width, and the define description
# says supported on this CPU although only compiler support is checked.
if test x"${need_sve256}" = x"yes"; then
AC_MSG_CHECKING([whether we can compile SVE256 gcc intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <arm_sve.h>
int main(int argc, char **argv){
double *q;
svfloat64_t q1 = svld1_f64(svptrue_b64(), q);
svfloat64_t y1 = svmad_f64_z(svptrue_b64(), q1, q1, q1);
return 0;
}
])],
[can_compile_sve256=yes],
[can_compile_sve256=no]
)
AC_MSG_RESULT([${can_compile_sve256}])
if test x"$can_compile_sve256" != x"yes"; then
AC_MSG_ERROR([Could not compile a test program with SVE256, adjust the C compiler or CFLAGS. Possibly (some of) the flags " $SIMD_FLAGS " solve this issue])
fi
AC_DEFINE([HAVE_SVE256],[1],[SVE256 is supported on this CPU])
fi
# SVE512 probe, executed only when need_sve512 is yes.
# Compile-only check: the test program below is compiled, never linked or run.
# It verifies that the C compiler accepts ARM SVE intrinsics from arm_sve.h
# with the current flags. configure aborts with an error if compilation
# fails; otherwise HAVE_SVE512 is defined.
# NOTE(review): the test program is identical to the SVE128 and SVE256 probes,
# so it does not exercise a specific vector width, and the define description
# says supported on this CPU although only compiler support is checked.
if test x"${need_sve512}" = x"yes"; then
AC_MSG_CHECKING([whether we can compile SVE512 gcc intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <arm_sve.h>
int main(int argc, char **argv){
double *q;
svfloat64_t q1 = svld1_f64(svptrue_b64(), q);
svfloat64_t y1 = svmad_f64_z(svptrue_b64(), q1, q1, q1);
return 0;
}
])],
[can_compile_sve512=yes],
[can_compile_sve512=no]
)
AC_MSG_RESULT([${can_compile_sve512}])
if test x"$can_compile_sve512" != x"yes"; then
AC_MSG_ERROR([Could not compile a test program with SVE512, adjust the C compiler or CFLAGS. Possibly (some of) the flags " $SIMD_FLAGS " solve this issue])
fi
AC_DEFINE([HAVE_SVE512],[1],[SVE512 is supported on this CPU])
fi
if test x"${need_avx512}" = x"yes"; then
AC_MSG_CHECKING([whether we can compile AVX512 gcc intrinsics in C])
......@@ -1960,13 +2052,13 @@ if test x"${ax_cv_have_avx512f_cpu_ext}" = x"yes" -a x"${need_avx512}" = x"no";
echo " "
fi
echo " "
echo "***********************************************************************"
echo "* This is the first release candidate of ELPA 2020.11.001.rc1 *"
echo "* There might be still some changes until the final release of *"
echo "* ELPA 2020.11.001 *"
echo "***********************************************************************"
echo " "
#echo " "
#echo "***********************************************************************"
#echo "* This is the first release candidate of ELPA 2020.11.001.rc1 *"
#echo "* There might be still some changes until the final release of *"
#echo "* ELPA 2020.11.001 *"
#echo "***********************************************************************"
#echo " "
if test x"$enable_kcomputer" = x"yes" ; then
echo " "
......
......@@ -19,7 +19,7 @@
%define with_openmp 0
Name: elpa
Version: 2020.11.001.rc1
Version: 2020.11.001
Release: 1
Summary: A massively parallel eigenvector solver
License: LGPL-3.0
......
......@@ -60,8 +60,17 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 28, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 29, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_SVE128_BLOCK2, 28, @ELPA_2STAGE_REAL_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK4, 29, @ELPA_2STAGE_REAL_SVE128_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK6, 30, @ELPA_2STAGE_REAL_SVE128_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK2, 31, @ELPA_2STAGE_REAL_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK4, 32, @ELPA_2STAGE_REAL_SVE256_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK6, 33, @ELPA_2STAGE_REAL_SVE256_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 34, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 35, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 36, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 37, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 38, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......@@ -87,7 +96,15 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_SVE128_BLOCK1, 14, @ELPA_2STAGE_COMPLEX_SVE128_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE128_BLOCK2, 15, @ELPA_2STAGE_COMPLEX_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE256_BLOCK1, 16, @ELPA_2STAGE_COMPLEX_SVE256_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE256_BLOCK2, 17, @ELPA_2STAGE_COMPLEX_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK1, 18, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 19, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1, 20, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 22, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
......@@ -9,5 +9,8 @@
#define VSX_INSTR 9
#define ARCH64_INSTR 10
#define SPARC_INSTR 11
#define SVE128_INSTR 12
#define SVE256_INSTR 13
#define SVE512_INSTR 14
#define NUMBER_OF_INSTR 12
#define NUMBER_OF_INSTR 15
......@@ -48,18 +48,21 @@ module mod_check_for_gpu
contains
function check_for_gpu(myid, numberOfDevices, wantDebug) result(gpuAvailable)
function check_for_gpu(obj, myid, numberOfDevices, wantDebug) result(gpuAvailable)
use cuda_functions
use precision
use elpa_mpi
use elpa_abstract_impl
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: myid
logical, optional, intent(in) :: wantDebug
logical :: success, wantDebugMessage
integer(kind=ik), intent(out) :: numberOfDevices
integer(kind=ik) :: deviceNumber, mpierr, maxNumberOfDevices
logical :: gpuAvailable
integer(kind=ik) :: error, mpi_comm_all
!character(len=1024) :: envname
if (.not.(present(wantDebug))) then
......@@ -74,6 +77,12 @@ module mod_check_for_gpu
gpuAvailable = .false.
call obj%get("mpi_comm_parent",mpi_comm_all,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for mpi_comm_parent. Aborting..."
stop
endif
if (cublasHandle .ne. -1) then
gpuAvailable = .true.
numberOfDevices = -1
......@@ -98,7 +107,7 @@ module mod_check_for_gpu
! make sure that all nodes have the same number of GPU's, otherwise
! we run into loadbalancing trouble
#ifdef WITH_MPI
call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr)
call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, MPI_MAX, mpi_comm_all, mpierr)
if (maxNumberOfDevices .ne. numberOfDevices) then
print *,"Different number of GPU devices on MPI tasks!"
......
......@@ -329,7 +329,7 @@ function elpa_solve_evp_&
if (useGPU) then
call obj%timer%start("check_for_gpu")
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
if (check_for_gpu(obj, my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the neccessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
......
......@@ -176,7 +176,7 @@
if (useGPU) then
call obj%timer%start("check_for_gpu")
if (check_for_gpu(myid,numGPU)) then
if (check_for_gpu(obj, myid, numGPU)) then
! set the neccessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
......
This diff is collapsed.
......@@ -360,7 +360,7 @@
do_useGPU = .false.
if (useGPU) then
call obj%timer%start("check_for_gpu")
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
if (check_for_gpu(obj, my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
......
......@@ -319,7 +319,11 @@ subroutine trans_ev_tridi_to_band_&
#ifdef DOUBLE_PRECISION_REAL
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &