Commit 1d2b3806 authored by Andreas Marek

Merge branch 'master_pre_stage' into 'master'

Master pre stage

See merge request !54
parents e6767067 1996d441
......@@ -27,7 +27,7 @@ autotools procedure. This is the **only supported way** how to build and install
If you obtained *ELPA* from the official git repository, you will not find
the needed configure script! You will have to create the configure script with autoconf.
the needed configure script! You will have to create the configure script with autoconf. You can also run the `autogen.sh` script that does this step for you.
## (A): Installing *ELPA* as library with configure ##
......
......@@ -69,6 +69,8 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/elpa1/mod_distribute_global_column.F90 \
src/elpa1/mod_v_add_s.F90 \
src/elpa1/mod_solve_secular_equation.F90 \
src/helpers/mod_thread_affinity.F90 \
src/helpers/check_thread_affinity.c \
src/elpa_index.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa_c_interface.c
......
......@@ -99,7 +99,7 @@ Nonetheless, we are grateful if you cite the following publications:
Yu, V.; Moussa, J.; Kus, P.; Marek, A.; Messmer, P.; Yoon, M.; Lederer, H.; Blum, V.
"GPU-Acceleration of the ELPA2 Distributed Eigensolver for Dense Symmetric and Hermitian Eigenproblems",
https://arxiv.org/abs/2002.10991
Computer Physics Communications, 262, 2021
If you use the new API and/or autotuning:
......
......@@ -610,9 +610,14 @@ coverage = {
#"knl" : "--enable-avx512",
#"power8" : " --enable-vsx --disable-sse --disable-sse-assembly --disable-avx --disable-avx2 --disable-avx512 --disable-mpi-module --with-GPU-compute-capability=sm_60 ",
#instruction_set = {
# "sse" : " --enable-sse --enable-sse-assembly",
# "avx" : " --enable-avx",
# "avx2" : " --enable-avx2",
# "avx512" : "--enable-avx512",
#}
instruction_set = {
"sse" : " --enable-sse --enable-sse-assembly",
"avx" : " --enable-avx",
"avx2" : " --enable-avx2",
"avx512" : "--enable-avx512",
}
......
......@@ -308,6 +308,7 @@ if test x"${FC_does_infer_interfaces}" = x"yes"; then
# argument checking or fail
# 2. no MPI case: switch of PACK_REAL_TO_COMPLEX
if test x"${with_mpi}" = x"yes"; then
AC_MSG_CHECKING(whether MPI module defines all interfaces)
# check whether MPI module defines all interfaces; not the case for Intel MPI!
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
program test_mpi_interfaces
......@@ -316,6 +317,8 @@ if test x"${FC_does_infer_interfaces}" = x"yes"; then
integer :: rank
integer :: buf(10)
integer :: ierr
real*8 :: a(2)
complex*16 :: b(2)
call MPI_Init(ierr)
call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
......@@ -324,19 +327,27 @@ if test x"${FC_does_infer_interfaces}" = x"yes"; then
buf(:) = 42;
end if
! this is OK
call MPI_Bcast(buf, 10, MPI_INT, 0, MPI_COMM_WORLD, ierr)
!! this is OK
!call MPI_Bcast(buf, 10, MPI_INT, 0, MPI_COMM_WORLD, ierr)
! Oops, wrong order here:
call MPI_Bcast(10, buf, MPI_INT, 0, MPI_COMM_WORLD, ierr)
!! Oops, wrong order here:
!call MPI_Bcast(10, buf, MPI_INT, 0, MPI_COMM_WORLD, ierr)
! if the correct interfaces exist in the MPI module
! this must work. If not, and the compiler infers interfaces,
! this will fail
call MPI_Bcast(a, 2, MPI_REAL8, 0, MPI_COMM_WORLD, ierr)
call MPI_Bcast(b, 2, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD, ierr)
call MPI_Finalize(ierr)
end program
])],
[MPImodule_has_interfaces=no],
[MPImodule_has_interfaces=yes]
[MPImodule_has_interfaces=yes],
[MPImodule_has_interfaces=no]
)
AC_MSG_RESULT([${MPImodule_has_interfaces}])
if test x"${MPImodule_has_interfaces}" = x"no"; then
AC_MSG_CHECKING(whether we can cure missing interfaces by allowing argument mismatch)
#check whether we can cure this by disabling argument checking
FCFLAGS_SAVE2="$FCFLAGS"
FCFLAGS="$FCFLAGS -fallow-argument-mismatch"
......@@ -353,10 +364,11 @@ if test x"${FC_does_infer_interfaces}" = x"yes"; then
end program
])],
[FC_does_still_infer_interfaces=no],
[FC_does_still_infer_interfaces=yes]
[FC_infer_interfaces_cured=yes],
[FC_infer_interfaces_cured=no]
)
if test x"${FC_does_still_infer_interfaces}" = x"yes"; then
AC_MSG_RESULT([${FC_infer_interfaces_cured}])
if test x"${FC_infer_interfaces_cured}" = x"no"; then
FCFLAGS="$FCFLAGS_SAVE2"
AC_MSG_ERROR([Fortran compiler infers interfaces; but MPI module does not supply all of them])
fi
......
......@@ -106,7 +106,7 @@
!>
!> \code{.f90}
!> use elpa
!> class(elpa_t), pointer :: elpa
!> class(elpa_t), pointer :: elpaInstance
!> integer :: success
!>
!> ! We urge the user to always check the error code of all ELPA functions
......@@ -121,34 +121,34 @@
!> endif
!>
!> ! set parameters describing the matrix and its MPI distribution
!> call elpa%set("na", na, success, success)
!> call elpaInstance%set("na", na, success)
!> if (success /= ELPA_OK) then
!> print *,"Could not set entry"
!> endif
!> call elpa%set("nev", nev, success, success)
!> call elpaInstance%set("nev", nev, success, success)
!> ! check success code ...
!>
!> call elpa%set("local_nrows", na_rows, success)
!> call elpaInstance%set("local_nrows", na_rows, success)
!> ! check success code ...
!>
!> call elpa%set("local_ncols", na_cols, success)
!> call elpa%set("nblk", nblk, success)
!> call elpa%set("mpi_comm_parent", MPI_COMM_WORLD, success)
!> call elpa%set("process_row", my_prow, success)
!> call elpa%set("process_col", my_pcol, success)
!> call elpaInstance%set("local_ncols", na_cols, success)
!> call elpaInstance%set("nblk", nblk, success)
!> call elpaInstance%set("mpi_comm_parent", MPI_COMM_WORLD, success)
!> call elpaInstance%set("process_row", my_prow, success)
!> call elpaInstance%set("process_col", my_pcol, success)
!>
!> ! set up the elpa object
!> success = elpa%setup()
!> success = elpaInstance%setup()
!> if (success /= ELPA_OK) then
!> print *,"Could not setup ELPA object"
!> endif
!>
!> ! if desired, set tunable run-time options
!> ! here we want to use the 2-stage solver
!> call elpa%set("solver", ELPA_SOLVER_2STAGE, success)
!> call elpaInstance%set("solver", ELPA_SOLVER_2STAGE, success)
!>
!> ! and set a specific kernel (must be supported on the machine)
!> call elpa%set("real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2)
!> call elpaInstance%set("real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2)
!> \endcode
!> ... set and get all other options that are desired
!> \code{.f90}
......@@ -159,7 +159,7 @@
!> ! use method solve to solve the eigenvalue problem to obtain eigenvalues
!> ! and eigenvectors
!> ! other possible methods are described in \ref elpa_api::elpa_t derived type
!> call elpa%eigenvectors(a, ev, z, success)
!> call elpaInstance%eigenvectors(a, ev, z, success)
!>
!> ! cleanup
!> call elpa_deallocate(elpaInstance, success)
......
......@@ -83,6 +83,7 @@ function elpa_solve_evp_&
use elpa_scalapack_interfaces
#endif
use solve_tridi
use thread_affinity
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
......@@ -176,6 +177,8 @@ function elpa_solve_evp_&
MATH_DATATYPE(kind=rck), allocatable, target :: aIntern(:,:)
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable, target :: qIntern(:,:)
#endif
integer(kind=c_int) :: pinningInfo
logical :: do_tridiag, do_solve, do_trans_ev
integer(kind=ik) :: nrThreads
integer(kind=ik) :: global_index
......@@ -216,8 +219,19 @@ function elpa_solve_evp_&
#ifdef WITH_NVTX
call nvtxRangePush("elpa1")
#endif
call obj%get("output_pinning_information", pinningInfo, error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option for debug. Aborting..."
stop
endif
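! if requested, report how the MPI tasks and their OpenMP threads are pinned to CPUs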
if (pinningInfo .eq. 1) then
call init_thread_affinity(nrThreads)
call check_thread_affinity()
if (my_pe .eq. 0) call print_thread_affinity(my_pe)
call cleanup_thread_affinity()
endif
success = .true.
#ifdef REDISTRIBUTE_MATRIX
......@@ -619,7 +633,6 @@ function elpa_solve_evp_&
call blacs_gridexit(blacs_ctxt_)
endif
#endif /* REDISTRIBUTE_MATRIX */
call obj%timer%stop("elpa_solve_evp_&
&MATH_DATATYPE&
&_1stage_&
......
......@@ -581,8 +581,17 @@ subroutine tridiag_&
#ifdef WITH_OPENMP_TRADITIONAL
call obj%timer%start("OpenMP parallel")
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
!todo : check whether GPU implementation with large matrix multiply is beneficial
! for a larger number of threads; could be addressed with autotuning if this
! is the case
!$omp parallel &
!$omp num_threads(max_threads) &
!$omp default(none) &
!$omp private(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end) &
!$omp shared(useGPU, isSkewsymmetric, cudaMemcpyDeviceToHost, successCuda, u_row, u_row_dev, &
!$omp & v_row, v_row_dev, v_col, v_col_dev, u_col, u_col_dev, a_dev, a_offset, &
!$omp& max_local_cols, max_local_rows, obj, wantDebug, l_rows_per_tile, l_cols_per_tile, &
!$omp& matrixRows, istep, tile_size, l_rows, l_cols, ur_p, uc_p, a_mat)
my_thread = omp_get_thread_num()
n_threads = omp_get_num_threads()
......
......@@ -93,7 +93,7 @@
matrixRows = obj%local_nrows
matrixCols = obj%local_ncols
#ifdef WITH_OPENMP
#ifdef WITH_OPENMP_TRADITIONAL
! store the number of OpenMP threads used in the calling function
! restore this at the end of ELPA 2
omp_threads_caller = omp_get_max_threads()
......@@ -141,7 +141,7 @@
! restore original OpenMP settings
#ifdef WITH_OPENMP
#ifdef WITH_OPENMP_TRADITIONAL
! restore the number of OpenMP threads used by the calling function
call omp_set_num_threads(omp_threads_caller)
......
......@@ -693,7 +693,11 @@ max_threads)
!This does not help performance due to the addition of two OpenMP barriers around the MPI call,
!but in the future this may be beneficial if these barriers are replaced with a faster implementation
!$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
!$omp parallel &
!$omp default(none) &
!$omp shared(lc, istep, nbw, my_pcol, np_cols, nblk, &
!$omp& lr, vr, a_mat, transformChunkSize, tau, aux1, aux2, wantDebug, mpi_comm_rows, obj) &
!$omp private(mynlc, j, lcx, ii, pp, mpierr )
mynlc = 0 ! number of local columns
!This loop does not have independent iterations,
......@@ -941,22 +945,20 @@ max_threads)
n_way = 1
#ifdef WITH_OPENMP_TRADITIONAL
#if REALCASE == 1
n_way = max_threads
!$omp parallel private( i,lcs,lce,lrs,lre)
#endif
if (n_way > 1) then
#if REALCASE == 1
!$omp do
#endif
!$omp parallel do &
!$omp default(none) &
!$omp private(i) &
!$omp shared(l_cols_tile, l_cols, umcCPU, n_cols)
do i=1,min(l_cols_tile, l_cols)
umcCPU(i,1:n_cols) = 0.0_rck
enddo
#if REALCASE == 1
!$omp do
#endif
!$omp parallel do &
!$omp default(none) &
!$omp private(i) &
!$omp shared(l_rows, vmrCPU, n_cols)
do i=1,l_rows
vmrCPU(i,n_cols+1:2*n_cols) = 0.0_rck
enddo
......@@ -977,9 +979,11 @@ max_threads)
!This algorithm was chosen because in this algorithm the loop around the dgemm calls
!is easily parallelized, and regardless of the choice of algorithm,
!the startup cost for parallelizing the dgemms inside the loop is too great
#if REALCASE == 1
!$omp do schedule(static,1)
#endif
!$omp parallel do schedule(static,1) &
!$omp default(none) &
!$omp private(i, lcs, lce, lrs, lre) &
!$omp shared(istep, nbw, tile_size, obj, l_cols, l_cols_tile, l_rows, isSkewsymmetric, &
!$omp& n_cols, l_rows_tile, umcCPU, vmrCPU, a_mat)
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1 ! local column start
lce = min(l_cols, (i+1)*l_cols_tile) ! local column end
......@@ -1139,9 +1143,6 @@ max_threads)
#ifdef WITH_OPENMP_TRADITIONAL
endif ! n_way > 1
#if REALCASE == 1
!$omp end parallel
#endif
#endif
! Sum up all ur(:) parts along rows and add them to the uc(:) parts
! on the processors containing the diagonal
......
......@@ -89,6 +89,7 @@
use elpa_scalapack_interfaces
#endif
use solve_tridi
use thread_affinity
use, intrinsic :: iso_c_binding
implicit none
#include "../general/precision_kinds.F90"
......@@ -200,6 +201,7 @@
#endif
integer(kind=ik) :: global_index
logical :: reDistributeMatrix, doRedistributeMatrix
integer(kind=ik) :: pinningInfo
#if REALCASE == 1
#undef GPU_KERNEL
......@@ -238,6 +240,20 @@
nrThreads = 1
#endif
call obj%get("output_pinning_information", pinningInfo, error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option for debug. Aborting..."
stop
endif
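! if requested, report how the MPI tasks and their OpenMP threads are pinned to CPUs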
if (pinningInfo .eq. 1) then
call init_thread_affinity(nrThreads)
call check_thread_affinity()
if (my_pe .eq. 0) call print_thread_affinity(my_pe)
call cleanup_thread_affinity()
endif
success = .true.
#ifdef REDISTRIBUTE_MATRIX
......
......@@ -271,6 +271,7 @@ static const elpa_index_int_entry_t int_entries[] = {
BOOL_ENTRY("print_flops", "Print FLOP rates on task 0", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
BOOL_ENTRY("measure_performance", "Also measure with flops (via papi) with the timings", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
BOOL_ENTRY("check_pd", "Check eigenvalues to be positive", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
BOOL_ENTRY("output_pinning_information", "Print the pinning information", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
BOOL_ENTRY("cannon_for_generalized", "Whether to use Cannons algorithm for the generalized EVP", 1, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
};
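A minimal usage sketch for the new option (not part of this diff), following the object-oriented Fortran API shown in the documentation excerpt above; elpaInstance and success are the illustrative names used there:

    ! enable the pinning report before calling a solver method
    call elpaInstance%set("output_pinning_information", 1, success)
    if (success /= ELPA_OK) then
       print *, "Could not enable output_pinning_information"
    endif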
......
// Copyright 2021, A. Marek
//
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sched.h>
#include <sys/types.h>
#include <unistd.h>
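/* return the id of the CPU on which the calling thread is currently running */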
void get_thread_affinity(int *cpu_id) {
*cpu_id = sched_getcpu();
}
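/* determine the affinity mask of the calling process and report the highest-numbered CPU contained in it */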
void get_process_affinity(int *cpu_id) {
cpu_set_t set;
int ret, i;
int cpu;
*cpu_id = 9999999;
ret = sched_getaffinity(0, sizeof(cpu_set_t), &set);
if (ret != 0) { return; }
for (i=0; i < CPU_SETSIZE; i++)
{
cpu = CPU_ISSET(i, &set);
if (cpu != 0) { *cpu_id = i; }
}
}
void get_process_id(int *process_id, int *pprocess_id) {
int id;
*process_id = 0;
*pprocess_id = 0;
id = getpid();
*process_id = id ;
id = getppid();
*pprocess_id = id ;
//printf("My pid %d \n",*process_id);
//printf("My ppid %d \n",*pprocess_id);
}
! Copyright 2021, A. Marek
!
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
! Author: Andreas Marek, MPCDF
#include "config-f90.h"
module thread_affinity
use precision
implicit none
public :: check_thread_affinity, &
init_thread_affinity, cleanup_thread_affinity, print_thread_affinity
private
! integer(kind=ik) :: thread_num
integer(kind=ik) :: thread_max
integer(kind=ik) :: process_cpu_id
integer(kind=ik), allocatable :: cpu_ids(:)
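! C helper routines implemented in src/helpers/check_thread_affinity.c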
interface
subroutine get_process_id_c(process_id, pprocess_id) bind(C, name="get_process_id")
use, intrinsic :: iso_c_binding
implicit none
integer(kind=C_INT), intent(out) :: process_id, pprocess_id
end subroutine
end interface
interface
subroutine get_thread_affinity_c(cpu_id) bind(C, name="get_thread_affinity")
use, intrinsic :: iso_c_binding
implicit none
integer(kind=C_INT) :: cpu_id
end subroutine
end interface
interface
subroutine get_process_affinity_c(cpu_id) bind(C, name="get_process_affinity")
use, intrinsic :: iso_c_binding
implicit none
integer(kind=C_INT), intent(out) :: cpu_id
end subroutine
end interface
contains
subroutine get_thread_affinity(cpu_id)
use, intrinsic :: iso_c_binding
implicit none
integer(kind=ik), intent(out) :: cpu_id
integer(kind=C_INT) :: cpu_id_c
call get_thread_affinity_c(cpu_id_c)
cpu_id = int(cpu_id_c, kind=ik)
end subroutine
subroutine get_process_affinity(cpu_id)
use, intrinsic :: iso_c_binding
implicit none
integer(kind=ik), intent(out) :: cpu_id
integer(kind=C_INT) :: cpu_id_c
call get_process_affinity_c(cpu_id_c)
cpu_id = int(cpu_id_c, kind=ik)
end subroutine
subroutine get_process_id(process_id, pprocess_id)
use, intrinsic :: iso_c_binding
implicit none
integer(kind=ik), intent(out) :: process_id, pprocess_id
integer(kind=C_INT) :: process_id_c, pprocess_id_c
call get_process_id_c(process_id_c, pprocess_id_c)
process_id = int(process_id_c, kind=ik)
pprocess_id = int(pprocess_id_c, kind=ik)
end subroutine
subroutine init_thread_affinity(nrThreads)
use precision
use omp_lib
implicit none
integer(kind=ik) :: istat
integer(kind=ik), intent(in) :: nrThreads
thread_max = nrThreads
#ifdef WITH_OPENMP_TRADITIONAL
if(.not.(allocated(cpu_ids))) then
allocate(cpu_ids(0:thread_max-1), stat=istat)
if (istat .ne. 0) then
print *,"Error when allocating init_thread_affinity"
endif
endif
#endif
end subroutine init_thread_affinity
subroutine cleanup_thread_affinity
use precision
implicit none
integer(kind=ik) :: istat
if((allocated(cpu_ids))) then
deallocate(cpu_ids, stat=istat)
if (istat .ne. 0) then
print *,"Error when deallocating init_thread_affinity"
endif
endif
end subroutine cleanup_thread_affinity
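! query the CPU binding of the process and, with OpenMP, of the individual threads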
subroutine check_thread_affinity()
use precision
use omp_lib
implicit none
integer(kind=ik) :: thread_cpu_id
integer(kind=ik) :: i, actuall_num
call get_process_affinity(process_cpu_id)
#ifdef WITH_OPENMP_TRADITIONAL