Commit 7ac01bdd authored by Andreas Marek

Merge branch 'master_pre_stage' into 'master'

modify max_stored_rows

See merge request !70
parents 377ac3b8 7854c078
......@@ -2,7 +2,7 @@ Changelog for next release
- not yet decided
Changelog for ELPA 2021.05.001.rc
Changelog for ELPA 2021.05.001.rc1
- allow the user to set the mapping of MPI tasks to GPU ids via the set/get methods
- experimental feature: port to AMD GPUs, works correctly, performance yet
......@@ -10,6 +10,18 @@ Changelog for ELPA 2021.05.001.rc
- On request, ELPA can print the pinning of MPI tasks and OpenMP threads
- support for FUGAKU: some minor fixes still have to be done due to compiler
  issues
- BUG FIX: if matrix is already banded, check whether bandwidth >= 2. DO NOT
ALLOW a bandwidth = 1, since this would imply that the input matrix is
already diagonal, which the ELPA algorithms do not support
- BUG FIX in internal test programs: do not consider a residual of 0.0 to be
an error
- support for skew-symmetric matrices now enabled by default
- BUG FIX in generalized case: in setups like "mpiexec -np 4 ./validate_real_double_generalized_1stage_random 90 90 45"
- ELPA_SETUPS does now (in case of MPI runs) check whether the user-provided BLACSGRID is reasonable (i.e. ELPA does
  _not_ rely anymore on the user checking the BLACSGRID prior to calling ELPA); if this check fails,
  then ELPA returns with an error
- limit number of OpenMP threads to one, if MPI thread level is not at least MPI_THREAD_SERIALIZED
- allow checking of the supported threading level of the MPI library at build time
Changelog for ELPA 2020.11.001
......
......@@ -130,7 +130,7 @@ the possible configure options.
## Using *ELPA*
Please have a look at the [USERS_GUIDE](USERS_GUIDE.md) file, to get a documentation or at the [online](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2021.05.001.rc1/html/index.html) doxygen documentation, where you find the definition of the interfaces. You might want to have a look at the [PERFORMANCE tuning document](./documentation/PERFORMANCE_TUNING.md) to avoid some usual pitfalls.
Please have a look at the [USERS_GUIDE](./documentation/USERS_GUIDE.md) file for the documentation, or at the [online](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2021.05.001.rc1/html/index.html) doxygen documentation, where you can find the definition of the interfaces. You might want to have a look at the [PERFORMANCE tuning document](./documentation/PERFORMANCE_TUNING.md) to avoid some common pitfalls.
## Contributing to *ELPA*
......
......@@ -136,12 +136,29 @@ then
if [ "$gpuJob" == "yes" ]
then
cp $HOME/runners/job_script_templates/run_${CLUSTER}_1node_2GPU.sh .
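# only MPI rank 0 (SLURM_PROCID 0) of the generated job script runs configure and make; it signals completion by creating the file build_done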
echo "if \[ \$SLURM_PROCID -eq 0 \]" >> ./run_${CLUSTER}_1node_GPU.sh
echo "then" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "echo \"process \$SLURM_PROCID running configure\"" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "#decouple from SLURM (maybe this could be removed)" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "export _save_SLURM_MPI_TYPE=\$SLURM_MPI_TYPE" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "export _save_I_MPI_SLURM_EXT=\$I_MPI_SLURM_EXT" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "export _save_I_MPI_PMI_LIBRARY=\$I_MPI_PMI_LIBRARY" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "export _save_I_MPI_PMI2=\$I_MPI_PMI2" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "export _save_I_MPI_HYDRA_BOOTSTRAP=\$I_MPI_HYDRA_BOOTSTRAP" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "unset SLURM_MPI_TYPE I_MPI_SLURM_EXT I_MPI_PMI_LIBRARY I_MPI_PMI2 I_MPI_HYDRA_BOOTSTRAP" >> ./run_${CLUSTER}_1node_2GPU.sh
echo " " >> ./run_${CLUSTER}_1node_2GPU.sh
echo "./configure " "$configureArgs" >> ./run_${CLUSTER}_1node_2GPU.sh
echo " " >> ./run_${CLUSTER}_1node_2GPU.sh
echo "make -j 16" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "touch build_done" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "fi" >> ./run_${CLUSTER}_1node_2GPU.sh
echo " " >> ./run_${CLUSTER}_1node_2GPU.sh
echo "export OMP_NUM_THREADS=$ompThreads" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "export TASKS=$mpiTasks" >> ./run_${CLUSTER}_1node_2GPU.sh
#echo "while ! \[ -f ./build_done \];" >> ./run_${CLUSTER}_1node_2GPU.sh
#echo "do" >> ./run_${CLUSTER}_1node_2GPU.sh
#echo "echo \""\ > /dev/null" >> ./run_${CLUSTER}_1node_2GPU.sh
#echo "done" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "make check TEST_FLAGS=\" $matrixSize $nrEV $blockSize \" " >> ./run_${CLUSTER}_1node_2GPU.sh
echo " " >> ./run_${CLUSTER}_1node_2GPU.sh
echo "exitCode=\$?" >> ./run_${CLUSTER}_1node_2GPU.sh
......@@ -174,12 +191,30 @@ then
if [[ "$CI_RUNNER_TAGS" =~ "sse" ]] || [[ "$CI_RUNNER_TAGS" =~ "avx" ]] || [[ "$CI_RUNNER_TAGS" =~ "avx2" ]] || [ ["$CI_RUNNER_TAGS" =~ "avx512" ]]
then
cp $HOME/runners/job_script_templates/run_${CLUSTER}_1node.sh .
echo " " >> ./run_${CLUSTER}_1node.sh
echo "if \[ \$SLURM_PROCID -eq 0 \]" >> ./run_${CLUSTER}_1node.sh
echo "then" >> ./run_${CLUSTER}_1node.sh
echo "echo \"process \$SLURM_PROCID running configure\"" >> ./run_${CLUSTER}_1node.sh
echo "\#decouple from SLURM \(maybe this could be removed\)" >> ./run_${CLUSTER}_1node.sh
echo "export _save_SLURM_MPI_TYPE=\$SLURM_MPI_TYPE" >> ./run_${CLUSTER}_1node.sh
echo "export _save_I_MPI_SLURM_EXT=\$I_MPI_SLURM_EXT" >> ./run_${CLUSTER}_1node.sh
echo "export _save_I_MPI_PMI_LIBRARY=\$I_MPI_PMI_LIBRARY" >> ./run_${CLUSTER}_1node.sh
echo "export _save_I_MPI_PMI2=\$I_MPI_PMI2" >> ./run_${CLUSTER}_1node.sh
echo "export _save_I_MPI_HYDRA_BOOTSTRAP=\$I_MPI_HYDRA_BOOTSTRAP" >> ./run_${CLUSTER}_1node.sh
echo "unset SLURM_MPI_TYPE I_MPI_SLURM_EXT I_MPI_PMI_LIBRARY I_MPI_PMI2 I_MPI_HYDRA_BOOTSTRAP" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "./configure " "$configureArgs" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "make -j 16 " >> ./run_${CLUSTER}_1node.sh
echo "touch build_done" >> ./run_${CLUSTER}_1node.sh
echo "fi" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "export OMP_NUM_THREADS=$ompThreads" >> ./run_${CLUSTER}_1node.sh
echo "export TASKS=$mpiTasks" >> ./run_${CLUSTER}_1node.sh
echo "while ! [ -f ./build_done ];" >> ./run_${CLUSTER}_1node.sh
echo "do" >> ./run_${CLUSTER}_1node.sh
echo "echo \" \" > /dev/null" >> ./run_${CLUSTER}_1node.sh
echo "done" >> ./run_${CLUSTER}_1node.sh
echo "make check TEST_FLAGS=\" $matrixSize $nrEV $blockSize \" " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "exitCode=\$?" >> ./run_${CLUSTER}_1node.sh
......
......@@ -121,7 +121,20 @@ then
if [[ "$CI_RUNNER_TAGS" =~ "distcheck" ]]
then
cp $HOME/runners/job_script_templates/run_${CLUSTER}_1node.sh .
echo " " >> ./run_${CLUSTER}_1node.sh
echo "if [ \$SLURM_PROCID -eq 0 ]" >> ./run_${CLUSTER}_1node.sh
echo "then" >> ./run_${CLUSTER}_1node.sh
echo "echo \"process \$SLURM_PROCID running configure\"" >> ./run_${CLUSTER}_1node.sh
echo "#decouple from SLURM (maybe this could be removed)" >> ./run_${CLUSTER}_1node.sh
echo "export _save_SLURM_MPI_TYPE=\$SLURM_MPI_TYPE" >> ./run_${CLUSTER}_1node.sh
echo "export _save_I_MPI_SLURM_EXT=\$I_MPI_SLURM_EXT" >> ./run_${CLUSTER}_1node.sh
echo "export _save_I_MPI_PMI_LIBRARY=\$I_MPI_PMI_LIBRARY" >> ./run_${CLUSTER}_1node.sh
echo "export _save_I_MPI_PMI2=\$I_MPI_PMI2" >> ./run_${CLUSTER}_1node.sh
echo "export _save_I_MPI_HYDRA_BOOTSTRAP=\$I_MPI_HYDRA_BOOTSTRAP" >> ./run_${CLUSTER}_1node.sh
echo "unset SLURM_MPI_TYPE I_MPI_SLURM_EXT I_MPI_PMI_LIBRARY I_MPI_PMI2 I_MPI_HYDRA_BOOTSTRAP" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "./configure " "$configureArgs" " || { cat config.log; exit 1; }" >> ./run_${CLUSTER}_1node.sh
echo "fi" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "export TASKS=$mpiTasks" >> ./run_${CLUSTER}_1node.sh
echo "export DISTCHECK_CONFIGURE_FLAGS=\" $distcheckConfigureArgs \" " >> ./run_${CLUSTER}_1node.sh
......
......@@ -135,16 +135,29 @@ then
echo "mkdir -p build" >> ./run_${CLUSTER}_1node.sh
echo "pushd build" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "if [ \$SLURM_PROCID -eq 0 ]" >> ./run_${CLUSTER}_1node.sh
echo "then" >> ./run_${CLUSTER}_1node.sh
echo "echo \"process \$SLURM_PROCID running configure\"" >> ./run_${CLUSTER}_1node.sh
echo "#decouple from SLURM (maybe this could be removed)" >> ./run_${CLUSTER}_1node.sh
echo "export _save_SLURM_MPI_TYPE=\$SLURM_MPI_TYPE" >> ./run_${CLUSTER}_1node.sh
echo "export _save_I_MPI_SLURM_EXT=\$I_MPI_SLURM_EXT" >> ./run_${CLUSTER}_1node.sh
echo "export _save_I_MPI_PMI_LIBRARY=\$I_MPI_PMI_LIBRARY" >> ./run_${CLUSTER}_1node.sh
echo "export _save_I_MPI_PMI2=\$I_MPI_PMI2" >> ./run_${CLUSTER}_1node.sh
echo "export _save_I_MPI_HYDRA_BOOTSTRAP=\$I_MPI_HYDRA_BOOTSTRAP" >> ./run_${CLUSTER}_1node.sh
echo "unset SLURM_MPI_TYPE I_MPI_SLURM_EXT I_MPI_PMI_LIBRARY I_MPI_PMI2 I_MPI_HYDRA_BOOTSTRAP" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "#Running autogen " >> ./run_${CLUSTER}_1node.sh
echo "../autogen.sh" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "#Running configure " >> ./run_${CLUSTER}_1node.sh
echo "../configure " "$configureArgs" " || { cat config.log; exit 1; }" >> ./run_${CLUSTER}_1node.sh
echo "#Running make " >> ./run_${CLUSTER}_1node.sh
echo "make -j 8 || { exit 1; }" >> ./run_${CLUSTER}_1node.sh
echo "touch build_done" >> ./run_${CLUSTER}_1node.sh
echo "fi" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "export TASKS=$mpiTasks" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "#Running make " >> ./run_${CLUSTER}_1node.sh
echo "make -j 8 || { exit 1; }" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "#Running make install" >> ./run_${CLUSTER}_1node.sh
echo "make install || { exit 1; }" >> ./run_${CLUSTER}_1node.sh
......
......@@ -398,6 +398,9 @@ if test x"${enable_openmp}" = x"yes"; then
FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
fi
#AC_LANG_POP([Fortran])
want_mpi_launcher="no"
AC_MSG_CHECKING(whether mpi-launcher should be detected)
AC_ARG_ENABLE(detect-mpi-launcher,
......@@ -596,6 +599,8 @@ if test x"$can_compile_with_mkl" = x"yes" ; then
AC_MSG_RESULT([${have_mkl}])
fi
#AC_LANG_POP([Fortran])
dnl if not mkl, check all the necessary libraries individually
if test x"${have_mkl}" = x"yes" ; then
WITH_MKL=1
......@@ -657,6 +662,7 @@ else
AC_MSG_ERROR([could not link with scalapack: specify path])
fi
fi
AC_LANG_PUSH([Fortran])
dnl check whether we can link it all together
AC_MSG_CHECKING([whether we can link a Fortran program with all blacs/scalapack])
......@@ -704,7 +710,7 @@ AC_MSG_RESULT([${fortran_can_check_environment}])
if test x"${fortran_can_check_environment}" = x"yes" ; then
AC_DEFINE([HAVE_ENVIRONMENT_CHECKING],[1],[Fortran can query environment variables])
fi
AC_LANG_POP([Fortran])
dnl check whether BAND_TO_FULL_BLOCKING is set
AC_MSG_CHECKING(whether BAND_TO_FULL_BLOCKING is requested)
......@@ -754,7 +760,7 @@ if test x"${user_sets_nvidia_gpu_compute_capability}" = x"yes" ; then
fi
fi
AC_LANG_PUSH([Fortran])
dnl Test possibility of 'use mpi', if requested
if test x"${with_mpi}" = x"yes" ; then
AC_ARG_ENABLE([mpi-module],
......@@ -788,6 +794,119 @@ if test x"${with_mpi}" = x"yes" ; then
fi
fi
fi
if test x"$with_mpi" = x"yes" && test x"$enable_openmp" = x"yes"; then
AC_MSG_CHECKING(whether the threading support of the MPI library should be checked during RUNTIME)
AC_ARG_ENABLE([runtime-threading-support-checks],
AS_HELP_STRING([--disable-runtime-threading-support-checks],
[do not check at runtime the required threading support of the MPI library. DISABLE ONLY AT YOUR OWN RISK! (default: on)]),
[
if test x"$enableval" = x"yes"; then
enable_runtime_threading_support_checks=yes
else
enable_runtime_threading_support_checks=no
fi
],
[enable_runtime_threading_support_checks=yes])
AC_MSG_RESULT([${enable_runtime_threading_support_checks}])
if test x"${enable_runtime_threading_support_checks}" = x"yes" ; then
AC_DEFINE([THREADING_SUPPORT_CHECK],[1],[can check at runtime the threading support level of MPI])
fi
AC_MSG_CHECKING(whether ELPA is allowed to limit the number of OpenMP threads at runtime)
AC_ARG_ENABLE([allow-thread-limiting],
AS_HELP_STRING([--enable-allow-thread-limiting],
[do a runtime check whether the threading support of the MPI library is sufficient. If not, ELPA will limit the number of OpenMP threads to 1 during the run]),
[
if test x"$enableval" = x"yes"; then
enable_allow_thread_limiting=yes
else
enable_allow_thread_limiting=no
fi
],
[enable_allow_thread_limiting=yes])
AC_MSG_RESULT([${enable_allow_thread_limiting}])
if test x"${enable_allow_thread_limiting}" = x"yes" ; then
AC_DEFINE([ALLOW_THREAD_LIMITING],[1],[ELPA can at runtime limit the number of OpenMP threads to 1 if needed])
fi
#consistency check
if test x"${enable_allow_thread_limiting}" = x"yes" && test x"${enable_runtime_threading_support_checks}" = x"no"; then
AC_MSG_ERROR([You cannot set --enable-allow-thread-limiting and --disable-runtime-threading-support-checks at the same time. Thread limiting needs runtime support checks!])
fi
if test x"${enable_allow_thread_limiting}" = x"no" && test x"${enable_runtime_threading_support_checks}" = x"yes"; then
AC_MSG_NOTICE([You set --disable-allow-thread-limiting and --enable-runtime-threading-support-checks. If ELPA detects during a run that])
AC_MSG_NOTICE([your MPI library does not provide a sufficient level of threading support, ELPA will only _print_ a warning and continue])
AC_MSG_NOTICE([This might lead to undefined behavior, including wrong results])
fi
AC_ARG_WITH([threading-support-check-during-build],[AS_HELP_STRING([--with-threading-support-check-during-build],[Do checks at build time whether the MPI threading level support is sufficient. (default: on)])],
[
if test x"$withval" = x"yes"; then
with_threading_support_check_during_build=yes
else
with_threading_support_check_during_build=no
fi
],
[with_threading_support_check_during_build=yes])
fi
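For orientation, the runtime check controlled by these options amounts to querying which threading level the MPI library actually provides. A minimal Fortran sketch of such a check (plain MPI calls only; this is an illustration, not ELPA's internal code) could look like this:

program check_mpi_threading
  implicit none
  include "mpif.h"
  integer :: provided, ierr

  ! ask for the highest level; MPI reports what it actually provides
  call mpi_init_thread(MPI_THREAD_MULTIPLE, provided, ierr)

  if (provided == MPI_THREAD_SERIALIZED .or. provided == MPI_THREAD_MULTIPLE) then
     print *, "MPI threading support is sufficient for hybrid MPI/OpenMP runs"
  else
     ! this is the situation --enable-allow-thread-limiting addresses:
     ! fall back to a single OpenMP thread to keep the results correct
     print *, "insufficient MPI threading support, limiting OpenMP threads to 1"
  end if

  call mpi_finalize(ierr)
end program check_mpi_threading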
if test x"${enable_openmp}" = x"yes" && test x"${with_mpi}" = x"yes" && test x"${with_threading_support_check_during_build}" = x"yes"; then
mpi_threading_level_sufficient=no
AC_MSG_NOTICE([**************************************************************************************************************************])
AC_MSG_NOTICE([* Please notice: if the following step hangs or aborts abnormally, then you cannot run a short MPI program during configure *])
AC_MSG_NOTICE([* In this case please re-run configure with '--without-threading-support-check-during-build' _AND_ follow the hints in *])
AC_MSG_NOTICE([* the INSTALL and USER_GUIDE documents! *])
AC_MSG_NOTICE([* In case you get some other warnings about threading support, follow one of the steps detailed there *])
AC_MSG_NOTICE([**************************************************************************************************************************])
AC_MSG_CHECKING(what threading level is supported by the MPI library)
AC_RUN_IFELSE([AC_LANG_SOURCE([
program testit
use iso_c_binding
implicit none
include "mpif.h"
integer(kind=c_int) :: provided, error, status
status = 1
call mpi_init_thread(MPI_THREAD_MULTIPLE, provided,error)
!call mpi_init(error)
!call mpi_query_thread(provided, error)
if (provided .eq. MPI_THREAD_SERIALIZED .or. provided .eq. MPI_THREAD_MULTIPLE) then
status = 0
endif
call mpi_finalize(error)
call exit(status)
end
])],
[mpi_threading_level_sufficient=yes],
[mpi_threading_level_sufficient=no],
[mpi_threading_level_sufficient=yes]
)
AC_MSG_RESULT([${mpi_threading_level_sufficient}])
if test x"${mpi_threading_level_sufficient}" = x"yes" ; then
AC_DEFINE([HAVE_SUFFICIENT_MPI_THREADING_SUPPORT],[1],[MPI threading support is sufficient])
else
AC_MSG_WARN([Your MPI implementation does not provide a sufficient threading level for OpenMP])
AC_MSG_WARN([You do have several options:])
AC_MSG_WARN([ * disable OpenMP (--disable-openmp): this will ensure correct results, but maybe with some performance drop])
AC_MSG_WARN([ * use an MPI-library with the required threading support level (see the INSTALL and USER_GUIDE): this will ])
AC_MSG_WARN([ ensure correct results and best performance])
AC_MSG_WARN([ * allow ELPA at runtime to change the number of threads to 1 by setting "--enable-runtime-threading-support-checks])
AC_MSG_WARN([ --enable-allow-thread-limiting --without-threading-support-check-during-build": this will ensure correct results, but ])
AC_MSG_WARN([ maybe not the best performance (depends on the threading of your blas/lapack libraries), see the USER_GUIDE])
AC_MSG_WARN([ * switch off the checking of threading support "--disable-runtime-threading-support-checks ])
AC_MSG_WARN([ --without-threading-support-check-during-build": DO THIS AT YOUR OWN RISK! This will be fast, but might])
AC_MSG_WARN([ (depending on your MPI library sometimes) lead to wrong results])
AC_MSG_ERROR([You do have to take one of the actions listed above!])
fi
fi
AC_LANG_POP([Fortran])
dnl Assemble the list of kernels to build
......@@ -2020,8 +2139,6 @@ fi
AM_CONDITIONAL([WANT_SINGLE_PRECISION_REAL],[test x"$want_single_precision" = x"yes"])
AM_CONDITIONAL([WANT_SINGLE_PRECISION_COMPLEX],[test x"$want_single_precision" = x"yes"])
#always define SKEWSYMMETRIC for the moment
AC_MSG_CHECKING(whether we should enable skew-symmetric support)
AC_ARG_ENABLE([skew-symmetric-support],
AS_HELP_STRING([--enable-skew-symmetric-support],
......@@ -2031,7 +2148,7 @@ AC_ARG_ENABLE([skew-symmetric-support],
else
enable_skewsymmetric=no
fi],
[enable_skewsymmetric=no])
[enable_skewsymmetric=yes])
AC_MSG_RESULT([${enable_skewsymmetric}])
AM_CONDITIONAL([HAVE_SKEWSYMMETRIC],[test x"$enable_skewsymmetric" = x"yes"])
if test x"${enable_skewsymmetric}" = x"yes"; then
......@@ -2331,6 +2448,13 @@ else
make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" CPP="$CPP"
fi
if test x"$enable_threading_support_checks" = x"no" && test x"$with_mpi" = x"yes" && test x"${enable_openmp}" = x"yes"; then
echo " "
echo " You disabled the checking whether your MPI library offers a sufficient level of threading support!"
echo " You 'convince' ELPA that everything is ok, do not complain about problems with ELPA in this build!"
echo " "
fi
if test x"$old_elpa_version" = x"yes"; then
echo " "
echo " It is possible that your current version of ELPA is not the latest one."
......
......@@ -197,6 +197,13 @@ The following compute routines are available in *ELPA*: Please have a look at th
## IV) Using OpenMP threading ##
IMPORTANT: In case of hybrid MPI and OpenMP builds it is **mandatory** that your MPI library supports the threading levels "MPI_THREAD_SERIALIZED" or
"MPI_THREAD_MULTIPLE" (you can check this, for example, by building ELPA with MPI and OpenMP and running one of the test programs; they will warn you
if this prerequisite is not met). If your MPI library does **not** provide these threading levels, then ELPA will internally (independent of what you
set) use only **one** OpenMP thread and inform you at runtime with a warning. The number of threads used in a threaded implementation of your BLAS library
is not affected by this, as long as these threads can be controlled by a method other than OMP_NUM_THREADS (for instance, with Intel's MKL
library you can specify MKL_NUM_THREADS).
If *ELPA* has been built with OpenMP threading support you can specify the number of OpenMP threads that *ELPA* will use internally.
Please note that it is **mandatory** to set the number of threads to be used with the OMP_NUM_THREADS environment variable **and**
with the **set method**
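As an illustration, a minimal sketch of such a hybrid setup, assuming an already created and configured ELPA handle `e` (the parameter name "omp_threads" and the use of ELPA_OK follow the usual ELPA set/get convention):

! in the shell or job script:   export OMP_NUM_THREADS=4
! ... after elpa_init(), elpa_allocate() and the mandatory set()/setup() calls:
call e%set("omp_threads", 4, error)        ! must match OMP_NUM_THREADS
if (error /= ELPA_OK) then
   print *, "could not set the number of OpenMP threads"
   stop
endif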
......@@ -376,6 +383,8 @@ to be done in the application using MPI which wants to call ELPA, namely
- Initializing the MPI
- creating a blacs distributed matrix
- IMPORTANT: it is very, very important that you check the return value of "descinit" for your blacs distribution!
  ELPA relies on the distribution it works on being _valid_. If this is not the case, the behavior is undefined!
- using this matrix within ELPA
The skeleton is not meant to be copied and pasted, since the details will always be dependent on the application which should
......@@ -438,6 +447,12 @@ call BLACS_Gridinfo( my_blacs_ctxt, nprow, npcol, my_prow, my_pcol )
call descinit( sc_desc, na, na, nblk, nblk, 0, 0, my_blacs_ctxt, na_rows, info )
! check the return code
if (info .ne. 0) then
print *,"Invalid blacs-distribution. Abort!"
stop
endif
! Allocate matrices
allocate(a (na_rows,na_cols))
......
......@@ -18,6 +18,9 @@ typedef struct elpa_autotune_struct *elpa_autotune_t;
#include <elpa/elpa_generated.h>
#include <elpa/elpa_generic.h>
#define ELPA_2STAGE_REAL_GPU ELPA_2STAGE_REAL_NVIDIA_GPU
#define ELPA_2STAGE_COMPLEX_GPU ELPA_2STAGE_COMPLEX_NVIDIA_GPU
const char *elpa_strerr(int elpa_error);
#endif
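These defines keep the old generic kernel names as aliases for the new NVIDIA-specific constants, so existing user code keeps compiling. A hedged Fortran sketch of selecting the NVIDIA GPU kernel, assuming an already set-up ELPA handle `e` (the parameter names used here are assumptions of this sketch, not confirmed by the diff):

! the legacy names ELPA_2STAGE_REAL_GPU / ELPA_2STAGE_COMPLEX_GPU now resolve to the NVIDIA constants
call e%set("nvidia-gpu", 1, error)                             ! enable GPU usage (parameter name assumed)
call e%set("real_kernel", ELPA_2STAGE_REAL_NVIDIA_GPU, error)  ! request the NVIDIA GPU kernel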
......@@ -76,7 +76,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
sorted(split_comm_flag.keys())):
if gid == 1 and (g == 0 ):
if gid == 1 and (g == "GPU_OFF" ):
continue
if lang == "C" and (m == "analytic" or m == "toeplitz" or m == "frank" or lay == "all_layouts"):
......@@ -362,7 +362,8 @@ print(" " + " \\\n ".join([
prec_flag['double']]))
print("endif")
name = "test_skewsymmetric_real_double"
name = "validate_skewsymmetric_real_double"
print("if HAVE_SKEWSYMMETRIC")
print("check_SCRIPTS += " + name + "_extended.sh")
print("noinst_PROGRAMS += " + name)
print(name + "_SOURCES = test/Fortran/test_skewsymmetric.F90")
......@@ -371,8 +372,10 @@ print(name + "_FCFLAGS = $(test_program_fcflags) \\")
print(" " + " \\\n ".join([
domain_flag['real'],
prec_flag['double']]))
print("endif")
name = "test_skewsymmetric_real_single"
name = "validate_skewsymmetric_real_single"
print("if HAVE_SKEWSYMMETRIC")
print("if WANT_SINGLE_PRECISION_REAL")
print("check_SCRIPTS += " + name + "_extended.sh")
print("noinst_PROGRAMS += " + name)
......@@ -383,6 +386,7 @@ print(" " + " \\\n ".join([
domain_flag['real'],
prec_flag['single']]))
print("endif")
print("endif")
......
......@@ -100,6 +100,13 @@
!> \brief Abstract definition of the elpa_t type
!>
!>
!> Since ELPA requires (in case of MPI builds) that the matrix is block-cyclically distributed,
!> the user has to ensure this distribution _before_ calling ELPA.
!> Experience shows that it is very important that the user checks the return code of
!> 'descinit' to verify that the block-cyclic distribution is valid.
!> Note that ELPA relies on a valid block-cyclic distribution and might show unexpected
!> behavior if this has not been ensured before calling ELPA.
!>
!> A typical usage of ELPA might look like this:
!>
!> Fortran synopsis
......
......@@ -105,6 +105,16 @@ module elpa1_impl
public :: elpa_solve_evp_complex_1stage_single_impl !< Driver routine for complex 1-stage eigenvalue problem
#endif
#ifdef HAVE_SKEWSYMMETRIC
public :: elpa_solve_skew_evp_real_1stage_double_impl !< Driver routine for real double-precision 1-stage skew-symmetric eigenvalue problem
#ifdef WANT_SINGLE_PRECISION_REAL
public :: elpa_solve_skew_evp_real_1stage_single_impl !< Driver routine for real single-precision 1-stage skew-symmetric eigenvalue problem
#endif
#endif /* HAVE_SKEWSYMMETRIC */
! imported from elpa1_auxilliary
public :: elpa_mult_at_b_real_double_impl !< Multiply double-precision real matrices A**T * B
......@@ -168,6 +178,7 @@ contains
!> \result success
#define REALCASE 1
#define DOUBLE_PRECISION 1
#undef ACTIVATE_SKEW
#include "../general/precision_macros.h"
#include "elpa1_template.F90"
#undef REALCASE
......@@ -205,6 +216,7 @@ contains
#define REALCASE 1
#define SINGLE_PRECISION 1
#undef ACTIVATE_SKEW
#include "../general/precision_macros.h"
#include "elpa1_template.F90"
#undef REALCASE
......@@ -241,6 +253,7 @@ contains
!> \result success
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#undef ACTIVATE_SKEW
#include "../general/precision_macros.h"
#include "elpa1_template.F90"
#undef DOUBLE_PRECISION
......@@ -280,10 +293,93 @@ contains
#define COMPLEXCASE 1
#define SINGLE_PRECISION
#undef ACTIVATE_SKEW
#include "../general/precision_macros.h"
#include "elpa1_template.F90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
#ifdef HAVE_SKEWSYMMETRIC
!> \brief elpa_solve_skew_evp_real_1stage_double_impl: Fortran function to solve the real double-precision skew-symmetric eigenvalue problem with 1-stage solver
!>
!> \details
!> \param obj elpa_t object contains:
!> \param - obj%na Order of matrix
!> \param - obj%nev number of eigenvalues/vectors to be computed
!> The smallest nev eigenvalues/eigenvectors are calculated.
!> \param - obj%local_nrows Leading dimension of a
!> \param - obj%local_ncols local columns of matrix q
!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions!
!> \param - obj%mpi_comm_rows MPI communicator for rows
!> \param - obj%mpi_comm_cols MPI communicator for columns
!> \param - obj%mpi_comm_parent parent (global) MPI communicator
!> \param - obj%gpu use GPU version (1 or 0)
!>
!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed.
!> Distribution is like in Scalapack.
!> The full matrix must be set (not only one half like in scalapack).
!> Destroyed on exit (upper and lower half).
!>
!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set
!>
!> \param q(ldq,matrixCols) On output: Eigenvectors of a
!> Distribution is like in Scalapack.
!> Must be always dimensioned to the full size (corresponding to (na,na))
!> even if only a part of the eigenvalues is needed.
!>
!>
!> \result success
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define ACTIVATE_SKEW
#include "../general/precision_macros.h"
#include "elpa1_template.F90"
#undef ACTIVATE_SKEW
#undef REALCASE
#undef DOUBLE_PRECISION
#ifdef WANT_SINGLE_PRECISION_REAL
!> \brief elpa_solve_skew_evp_real_1stage_single_impl: Fortran function to solve the real single-precision skew-symmetric eigenvalue problem with 1-stage solver
!> \details
!> \param obj elpa_t object contains:
!> \param - obj%na Order of matrix
!> \param - obj%nev number of eigenvalues/vectors to be computed
!> The smallest nev eigenvalues/eigenvectors are calculated.
!> \param - obj%local_nrows Leading dimension of a
!> \param - obj%local_ncols local columns of matrix q
!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions!
!> \param - obj%mpi_comm_rows MPI communicator for rows
!> \param - obj%mpi_comm_cols MPI communicator for columns
!> \param - obj%mpi_comm_parent parent (global) MPI communicator
!> \param - obj%gpu use GPU version (1 or 0)
!>
!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed.
!> Distribution is like in Scalapack.
!> The full matrix must be set (not only one half like in scalapack).
!> Destroyed on exit (upper and lower half).
!>
!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set
!>
!> \param q(ldq,matrixCols) On output: Eigenvectors of a
!> Distribution is like in Scalapack.
!> Must be always dimensioned to the full size (corresponding to (na,na))
!> even if only a part of the eigenvalues is needed.
!>
!>
!> \result success
#define REALCASE 1
#define SINGLE_PRECISION 1
#define ACTIVATE_SKEW
#include "../general/precision_macros.h"
#include "elpa1_template.F90"
#undef REALCASE
#undef ACTIVATE_SKEW
#undef SINGLE_PRECISION
#endif /* WANT_SINGLE_PRECISION_REAL */
#endif /* HAVE_SKEWSYMMETRIC */
end module ELPA1_impl
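For orientation, a hedged sketch of how these skew-symmetric drivers are reached through the object-oriented API, assuming an ELPA handle `e` that has already been set up (the method name skew_eigenvectors and the argument shapes are assumptions of this sketch; the *_impl routines above are internal):

! a : real skew-symmetric matrix, block-cyclically distributed, full matrix set
! ev: eigenvalues; q: eigenvectors, note the doubled second dimension
!     (2*obj%local_ncols), as in the qExtern declaration of the template
call e%skew_eigenvectors(a, ev, q, error)   ! method name assumed in this sketch
if (error /= ELPA_OK) then
   print *, "skew-symmetric solve failed"
endif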
......@@ -55,11 +55,19 @@
#include "../general/sanity.F90"
#include "../general/error_checking.inc"
#ifdef ACTIVATE_SKEW
function elpa_solve_skew_evp_&
&MATH_DATATYPE&
&_1stage_&
&PRECISION&
&_impl (obj, &
#else
function elpa_solve_evp_&
&MATH_DATATYPE&
&_1stage_&
&PRECISION&
&_impl (obj, &
#endif
#ifdef REDISTRIBUTE_MATRIX
aExtern, &
#else
......@@ -98,7 +106,7 @@ function elpa_solve_evp_&
MATH_DATATYPE(kind=rck), optional,target,intent(out) :: qExtern(obj%local_nrows,*)
#else
MATH_DATATYPE(kind=rck), intent(inout), target :: aExtern(obj%local_nrows,obj%local_ncols)
#ifdef HAVE_SKEWSYMMETRIC
#ifdef ACTIVATE_SKEW
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: qExtern(obj%local_nrows,2*obj%local_ncols)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: qExtern(obj%local_nrows,obj%local_ncols)
......@@ -112,7 +120,7 @@ function elpa_solve_evp_&
MATH_DATATYPE(kind=rck), optional,target,intent(out) :: q(obj%local_nrows,*)
#else
MATH_DATATYPE(kind=rck), intent(inout), target :: a(obj%local_nrows,obj%local_ncols)
#ifdef HAVE_SKEWSYMMETRIC
#ifdef ACTIVATE_SKEW
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,2*obj%local_ncols)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols)
......@@ -182,12 +190,16 @@ function elpa_solve_evp_&
integer(kind=c_int) :: pinningInfo
logical :: do_tridiag, do_solve, do_trans_ev
integer(kind=