Commit 82ad3340 authored by Andreas Marek's avatar Andreas Marek
Browse files

Check the provided threading level of the MPI library

It turned out that users ignored the warnings of the test programs
if their MPI library did not provide "MPI_THREAD_SERIALIZED" or
"MPI_THREAD_MULTIPLE".

Thus, for safety reasons, ELPA from now on checks, during the call to
"elpa_setup", the provided threading level of the MPI library.
If the provided level is too low, ELPA will do the following

- limit the number of OpenMP threads **internal** to ELPA to 1
- print a warning about this
- ignore the settings of the user via
-- the OMP_NUM_THREADS variable
-- calling the set-method set("omp_threads", value)

These settings will **not** affect the run-time of a threaded
blas and lapack library, **if** the number of threads for these
libraries can be controlled independently of the OMP_NUM_THREADS
variable (for example for Intel's MKL one can use MKL_NUM_THREADS)
parent 14f16b59
......@@ -20,6 +20,7 @@ issues
- ELPA_SETUPS does now (in case of MPI-runs) check whether the user-provided BLACSGRID is reasonable (i.e. ELPA does
_not_ rely anymore that the user does check prior to calling ELPA whether the BLACSGRID is ok) if this check fails
then ELPA returns with an error
- limit number of OpenMP threads to one, if MPI thread level is not at least MPI_THREAD_SERIALIZED
Changelog for ELPA 2020.11.001
......
......@@ -197,6 +197,13 @@ The following compute routines are available in *ELPA*: Please have a look at th
## IV) Using OpenMP threading ##
IMPORTANT: In case of hybrid MPI and OpenMP builds it is **mandatory** that your MPI library supports the threading levels "MPI_THREAD_SERIALIZED" or
"MPI_THREAD_MULTIPLE" (you can check this for example by building ELPA with MPI and OpenMP and running one of the test programs; they will warn you
if this prerequisite is not met). If your MPI library does **not** provide these threading levels, then ELPA will internally (independent of what you
set) use only **one** OpenMP thread and inform you at runtime with a warning. The number of threads used in a threaded implementation of your BLAS library
is not affected by this, as long as these threads can be controlled with another method than specifying OMP_NUM_THREADS (for instance with Intel's MKL
library you can specify MKL_NUM_THREADS).
If *ELPA* has been built with OpenMP threading support you can specify the number of OpenMP threads that *ELPA* will use internally.
Please note that it is **mandatory** to set the number of threads to be used with the OMP_NUM_THREADS environment variable **and**
with the **set method**
......
......@@ -190,7 +190,7 @@ function elpa_solve_evp_&
integer(kind=c_int) :: pinningInfo
logical :: do_tridiag, do_solve, do_trans_ev
integer(kind=ik) :: nrThreads
integer(kind=ik) :: nrThreads, limitThreads
integer(kind=ik) :: global_index
logical :: reDistributeMatrix, doRedistributeMatrix
......@@ -225,8 +225,14 @@ function elpa_solve_evp_&
omp_threads_caller = omp_get_max_threads()
! check the number of threads that ELPA should use internally
call obj%get("omp_threads",nrThreads,error)
call omp_set_num_threads(nrThreads)
call obj%get("limit_openmp_threads",limitThreads,error)
if (limitThreads .eq. 0) then
call obj%get("omp_threads",nrThreads,error)
call omp_set_num_threads(nrThreads)
else
nrThreads = 1
call omp_set_num_threads(nrThreads)
endif
#else
nrThreads = 1
#endif
......
......@@ -74,7 +74,7 @@
logical :: success
integer(kind=ik) :: istat, debug, error
character(200) :: errorMessage
integer(kind=ik) :: nrThreads
integer(kind=ik) :: nrThreads, limitThreads
call obj%timer%start("elpa_cholesky_&
&MATH_DATATYPE&
......@@ -88,8 +88,15 @@
omp_threads_caller = omp_get_max_threads()
! check the number of threads that ELPA should use internally
call obj%get("omp_threads",nrThreads,error)
call omp_set_num_threads(nrThreads)
call obj%get("limit_openmp_threads",limitThreads,error)
if (limitThreads .eq. 0) then
call obj%get("omp_threads",nrThreads,error)
call omp_set_num_threads(nrThreads)
else
nrThreads = 1
call omp_set_num_threads(nrThreads)
endif
#else
nrThreads=1
#endif
......
......@@ -80,7 +80,7 @@
logical :: success
integer :: debug, error
integer :: nrThreads
integer :: nrThreads, limitThreads
call obj%timer%start("elpa_solve_tridi_public_&
&MATH_DATATYPE&
......@@ -99,8 +99,14 @@
omp_threads_caller = omp_get_max_threads()
! check the number of threads that ELPA should use internally
call obj%get("omp_threads",nrThreads,error)
call obj%get("limit_openmp_threads",limitThreads,error)
if (limitThreads .eq. 0) then
call obj%get("omp_threads",nrThreads,error)
call omp_set_num_threads(nrThreads)
else
nrThreads = 1
call omp_set_num_threads(nrThreads)
endif
#else
nrThreads=1
#endif
......
......@@ -201,7 +201,7 @@
do_trans_to_band, do_trans_to_full
logical :: good_nblk_gpu
integer(kind=ik) :: nrThreads
integer(kind=ik) :: nrThreads, limitThreads
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
integer(kind=c_int) :: simdSetAvailable(NUMBER_OF_INSTR)
#endif
......@@ -262,9 +262,16 @@
! restore this at the end of ELPA 2
omp_threads_caller = omp_get_max_threads()
! check the number of threads that ELPA should use internally
call obj%get("omp_threads",nrThreads,error)
call omp_set_num_threads(nrThreads)
! check the number of threads that ELPA should use internally
call obj%get("limit_openmp_threads",limitThreads,error)
if (limitThreads .eq. 0) then
call obj%get("omp_threads",nrThreads,error)
call omp_set_num_threads(nrThreads)
else
nrThreads = 1
call omp_set_num_threads(nrThreads)
endif
#else
nrThreads = 1
#endif
......
......@@ -573,7 +573,7 @@ module elpa_impl
present_np_rows, present_np_cols, np_total
integer(kind=MPI_KIND) :: mpierr, mpierr2, my_idMPI, np_totalMPI, process_rowMPI, process_colMPI
integer(kind=MPI_KIND) :: mpi_comm_rowsMPI, mpi_comm_colsMPI, np_rowsMPI, np_colsMPI, &
mpi_string_lengthMPI, my_pcolMPI, my_prowMPI
mpi_string_lengthMPI, my_pcolMPI, my_prowMPI, providedMPI
character(len=MPI_MAX_ERROR_STRING) :: mpierr_string
integer(kind=BLAS_KIND) :: numroc_resultBLAS
integer(kind=c_int) :: info, na, nblk, na_rows, my_pcol, my_prow, numroc_result
......@@ -634,6 +634,19 @@ module elpa_impl
endif
endif
#ifdef WITH_OPENMP_TRADITIONAL
! Check the threading level provided by the MPI library. If it is neither
! MPI_THREAD_SERIALIZED nor MPI_THREAD_MULTIPLE, print a warning and set the
! "limit_openmp_threads" option to 1.
call mpi_query_thread(providedMPI, mpierr)
! The two inequalities must be combined with .and.: the level is insufficient
! only if it matches NEITHER accepted value. Combined with .or. the condition
! is true for every value of providedMPI (no value equals both constants), so
! the limit would be applied unconditionally.
if ((providedMPI .ne. MPI_THREAD_SERIALIZED) .and. (providedMPI .ne. MPI_THREAD_MULTIPLE)) then
write(error_unit,*) "WARNING elpa_setup: MPI threading level MPI_THREAD_SERIALIZED or MPI_THREAD_MULTIPLE required but &
&your implementation does not support this. The number of OpenMP threads within ELPA will be &
&limited to 1"
call self%set("limit_openmp_threads", 1, error)
! Abort the setup if the option could not be stored.
if (check_elpa_set(error, ELPA_ERROR_SETUP)) return
endif
#endif
! Create communicators ourselves
if (self%is_set("process_row") == 1 .and. self%is_set("process_col") == 1) then
......
......@@ -286,7 +286,7 @@ static const elpa_index_int_entry_t int_entries[] = {
#endif
INT_ENTRY("cannon_buffer_size", "Increasing the buffer size might make it faster, but costs memory", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY, \
cannon_buffer_size_cardinality, cannon_buffer_size_enumerate, cannon_buffer_size_is_valid, NULL, PRINT_YES),
//BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_REAL),
BOOL_ENTRY("limit_openmp_threads", "Limit the number if openmp threads to 1", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_NO),
BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_REAL, PRINT_YES),
BOOL_ENTRY("timings", "Enable time measurement", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
BOOL_ENTRY("debug", "Emit verbose debugging messages", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment