Commit 2eb3a354 authored by Andreas Marek's avatar Andreas Marek

Start to prepare ELPA2 for new interface

parent 1f343334
......@@ -14,6 +14,7 @@ libelpa@SUFFIX@_public_la_SOURCES = \
src/elpa.F90 \
src/elpa1/elpa1.F90 \
src/elpa2/elpa2.F90 \
src/elpa2/elpa2_new_interface.F90 \
src/elpa1/elpa1_auxiliary.F90 \
src/elpa1/elpa1_utilities.F90 \
src/elpa2/elpa2_utilities.F90 \
......@@ -55,6 +56,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2/elpa2_compute_complex_template.X90 \
src/elpa1/elpa1_template.X90 \
src/elpa2/elpa2_template.X90 \
src/elpa2/elpa2_template_new_interface.X90 \
src/elpa1_c_interface_template.X90 \
src/elpa2_c_interface_template.X90 \
src/elpa_driver_c_interface_template.X90 \
......@@ -1014,6 +1016,7 @@ EXTRA_DIST = \
src/elpa2/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa1/elpa1_template.X90 \
src/elpa2/elpa2_template.X90 \
src/elpa2/elpa2_template_new_interface.X90 \
src/elpa1_c_interface_template.X90 \
src/elpa2_c_interface_template.X90 \
src/elpa_driver_c_interface_template.X90 \
......
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! Author: Andreas Marek, MPCDF
#include "config-f90.h"
!> \brief Fortran module which provides the routines to use the 2-stage ELPA solver
module ELPA2_new
! Version 1.1.2, 2011-02-21
use elpa_utilities
use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve
use elpa2_utilities
implicit none
PRIVATE ! By default, all routines contained are private
! The following routines are public:
public :: elpa_solve_evp_real_2stage_double_new !< Driver routine for real double-precision 2-stage eigenvalue problem
public :: elpa_solve_evp_complex_2stage_double_new !< Driver routine for complex double-precision 2-stage eigenvalue problem
contains
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "../precision_macros.h"
!-------------------------------------------------------------------------------
!> \brief elpa_solve_evp_real_2stage_double_new: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach
!>
!> Parameters
!>
!> \param na Order of matrix a
!>
!> \param nev Number of eigenvalues needed
!>
!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed.
!> Distribution is like in ScaLAPACK.
!> The full matrix must be set (not only one half as in ScaLAPACK).
!> Destroyed on exit (upper and lower half).
!>
!> \param lda Leading dimension of a
!>
!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set
!>
!> \param q(ldq,matrixCols) On output: Eigenvectors of a
!> Distribution is like in ScaLAPACK.
!> Must always be dimensioned to the full size (corresponding to (na,na))
!> even if only a part of the eigenvalues is needed.
!>
!> \param ldq Leading dimension of q
!>
!> \param nblk blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols local columns of matrix a and q
!>
!> \param mpi_comm_rows MPI communicator for rows
!> \param mpi_comm_cols MPI communicator for columns
!> \param mpi_comm_all MPI communicator for the total processor set
!>
!> \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!> \param useQR (optional) use QR decomposition
!> \param useGPU (optional) decide whether to use GPUs or not
!>
!> \result success logical, false if an error occurred
!-------------------------------------------------------------------------------
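!>
!> A minimal, illustrative call sketch (the arguments na, nev, a, lda, ev, q,
!> ldq, nblk, matrixCols and the three MPI communicators are assumed to be
!> declared and set up as described above; the optional arguments for kernel
!> selection, QR decomposition and GPU usage are omitted):
!>
!> \code{.f90}
!>   use elpa2_new
!>   logical :: success
!>   success = elpa_solve_evp_real_2stage_double_new(na, nev, a, lda, ev, q, ldq, &
!>                                                   nblk, matrixCols, mpi_comm_rows, &
!>                                                   mpi_comm_cols, mpi_comm_all)
!>   if (.not. success) then
!>     ! handle the error, e.g. abort
!>   endif
!> \endcode
!-------------------------------------------------------------------------------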
#include "elpa2_template_new_interface.X90"
#undef REALCASE
#undef DOUBLE_PRECISION
#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "../precision_macros.h"
!-------------------------------------------------------------------------------
!> \brief elpa_solve_evp_real_2stage_single_new: Fortran function to solve the single-precision real eigenvalue problem with a 2 stage approach
!>
!> Parameters
!>
!> \param na Order of matrix a
!>
!> \param nev Number of eigenvalues needed
!>
!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed.
!> Distribution is like in ScaLAPACK.
!> The full matrix must be set (not only one half as in ScaLAPACK).
!> Destroyed on exit (upper and lower half).
!>
!> \param lda Leading dimension of a
!>
!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set
!>
!> \param q(ldq,matrixCols) On output: Eigenvectors of a
!> Distribution is like in ScaLAPACK.
!> Must always be dimensioned to the full size (corresponding to (na,na))
!> even if only a part of the eigenvalues is needed.
!>
!> \param ldq Leading dimension of q
!>
!> \param nblk blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols local columns of matrix a and q
!>
!> \param mpi_comm_rows MPI communicator for rows
!> \param mpi_comm_cols MPI communicator for columns
!> \param mpi_comm_all MPI communicator for the total processor set
!>
!> \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!> \param useQR (optional) use QR decomposition
!> \param useGPU (optional) decide whether GPUs should be used or not
!>
!> \result success logical, false if an error occurred
!-------------------------------------------------------------------------------
#include "elpa2_template_new_interface.X90"
#undef REALCASE
#undef SINGLE_PRECISION
#endif /* WANT_SINGLE_PRECISION_REAL */
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "../precision_macros.h"
!> \brief elpa_solve_evp_complex_2stage_double_new: Fortran function to solve the double-precision complex eigenvalue problem with a 2 stage approach
!>
!> Parameters
!>
!> \param na Order of matrix a
!>
!> \param nev Number of eigenvalues needed
!>
!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed.
!> Distribution is like in ScaLAPACK.
!> The full matrix must be set (not only one half as in ScaLAPACK).
!> Destroyed on exit (upper and lower half).
!>
!> \param lda Leading dimension of a
!>
!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set
!>
!> \param q(ldq,matrixCols) On output: Eigenvectors of a
!> Distribution is like in ScaLAPACK.
!> Must always be dimensioned to the full size (corresponding to (na,na))
!> even if only a part of the eigenvalues is needed.
!>
!> \param ldq Leading dimension of q
!>
!> \param nblk blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols local columns of matrix a and q
!>
!> \param mpi_comm_rows MPI communicator for rows
!> \param mpi_comm_cols MPI communicator for columns
!> \param mpi_comm_all MPI communicator for the total processor set
!>
!> \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!> \param useGPU (optional) decide whether GPUs should be used or not
!>
!> \result success logical, false if an error occurred
!-------------------------------------------------------------------------------
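!>
!> A minimal, illustrative call sketch (same assumptions as for the real case
!> above; note that the complex interface has no useQR argument):
!>
!> \code{.f90}
!>   success = elpa_solve_evp_complex_2stage_double_new(na, nev, a, lda, ev, q, ldq, &
!>                                                      nblk, matrixCols, mpi_comm_rows, &
!>                                                      mpi_comm_cols, mpi_comm_all)
!> \endcode
!-------------------------------------------------------------------------------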
#include "elpa2_template_new_interface.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "../precision_macros.h"
!> \brief elpa_solve_evp_complex_2stage_single_new: Fortran function to solve the single-precision complex eigenvalue problem with a 2 stage approach
!>
!> Parameters
!>
!> \param na Order of matrix a
!>
!> \param nev Number of eigenvalues needed
!>
!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed.
!> Distribution is like in ScaLAPACK.
!> The full matrix must be set (not only one half as in ScaLAPACK).
!> Destroyed on exit (upper and lower half).
!>
!> \param lda Leading dimension of a
!>
!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set
!>
!> \param q(ldq,matrixCols) On output: Eigenvectors of a
!> Distribution is like in ScaLAPACK.
!> Must always be dimensioned to the full size (corresponding to (na,na))
!> even if only a part of the eigenvalues is needed.
!>
!> \param ldq Leading dimension of q
!>
!> \param nblk blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols local columns of matrix a and q
!>
!> \param mpi_comm_rows MPI communicator for rows
!> \param mpi_comm_cols MPI communicator for columns
!> \param mpi_comm_all MPI communicator for the total processor set
!>
!> \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!> \param useGPU (optional) decide whether GPUs should be used or not
!>
!> \result success logical, false if an error occurred
!-------------------------------------------------------------------------------
#include "elpa2_template_new_interface.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
end module ELPA2_new
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
function elpa_solve_evp_&
&MATH_DATATYPE&
&_&
&2stage_&
&PRECISION&
&_new (na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
#if REALCASE == 1
THIS_ELPA_KERNEL_API, useQR, &
#endif
#if COMPLEXCASE == 1
THIS_ELPA_KERNEL_API, &
#endif
useGPU) result(success)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use elpa1_utilities, only : gpu_usage_via_environment_variable
use elpa1_compute
use elpa2_compute
use elpa_mpi
use cuda_functions
use mod_check_for_gpu
use iso_c_binding
implicit none
logical, intent(in), optional :: useGPU
#if REALCASE == 1
logical, intent(in), optional :: useQR
#endif
logical :: useQRActual, useQREnvironment
integer(kind=c_int) :: bandwidth
integer(kind=c_int), intent(in), optional :: THIS_ELPA_KERNEL_API
integer(kind=c_int) :: THIS_ELPA_KERNEL
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
mpi_comm_cols, mpi_comm_all
integer(kind=c_int), intent(in) :: nblk
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(lda,*), q(ldq,*)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(lda,matrixCols), q(ldq,matrixCols)
#endif
real(kind=C_DATATYPE_KIND), intent(inout) :: ev(na)
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:)
integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=c_int) :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: tmat(:,:,:)
real(kind=C_DATATYPE_KIND), allocatable :: e(:)
#if COMPLEXCASE == 1
real(kind=C_DATATYPE_KIND), allocatable :: q_real(:,:)
#endif
integer(kind=c_intptr_t) :: tmat_dev, q_dev, a_dev
real(kind=c_double) :: ttt0, ttt1, ttts ! MPI_WTIME always needs double
integer(kind=c_int) :: i
logical :: success, successCUDA
logical, save :: firstCall = .true.
logical :: wantDebug
integer(kind=c_int) :: istat
character(200) :: errorMessage
logical :: do_useGPU, do_useGPU_trans_ev_tridi
integer(kind=c_int) :: numberOfGPUDevices
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
call timer%start("solve_evp_&
&MATH_DATATYPE&
&_2stage" // &
&PRECISION_SUFFIX &
)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call timer%stop("mpi_communication")
wantDebug = .false.
if (firstCall) then
! are debug messages desired?
wantDebug = debug_messages_via_environment_variable()
firstCall = .false.
endif
success = .true.
do_useGPU = .false.
do_useGPU_trans_ev_tridi = .false.
#if REALCASE == 1
useQRActual = .false.
! set usage of qr decomposition via API call
if (present(useQR)) then
if (useQR) useQRActual = .true.
if (.not.(useQR)) useQRActual = .false.
endif
! overwrite this with environment variable settings
if (qr_decomposition_via_environment_variable(useQREnvironment)) then
useQRActual = useQREnvironment
endif
if (useQRActual) then
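! note: the QR path below is only allowed for an even matrix order na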
if (mod(na,2) .ne. 0) then
if (wantDebug) then
write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
endif
print *, "Do not use QR-decomposition for this matrix and blocksize."
success = .false.
return
endif
endif
#endif /* REALCASE */
if (present(useGPU)) then
if (useGPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the necessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
return
endif
endif
else
! check whether set by environment variable
do_useGPU = gpu_usage_via_environment_variable()
endif
if (present(THIS_ELPA_KERNEL_API)) then
! user defined kernel via the optional argument in the API call
THIS_ELPA_KERNEL = THIS_ELPA_KERNEL_API
else
! if the kernel is not chosen via the API
! check whether set by environment variable
THIS_ELPA_KERNEL = elpa_get_actual_&
&MATH_DATATYPE&
&_kernel()
endif
! check whether the chosen kernel is allowed: function returns true if NOT allowed! change this
if (check_allowed_&
&MATH_DATATYPE&
&_kernels(THIS_ELPA_KERNEL)) then
if (my_pe == 0) then
write(error_unit,*) " "
write(error_unit,*) "The chosen kernel ", &
&MATH_DATATYPE&
&_ELPA_KERNEL_NAMES(THIS_ELPA_KERNEL)
write(error_unit,*) "is not in the list of the allowed kernels!"
write(error_unit,*) " "
write(error_unit,*) "Allowed kernels are:"
do i=1,size( &
&MATH_DATATYPE&
&_ELPA_KERNEL_NAMES(:))
if (AVAILABLE_&
&MATH_DATATYPE&
&_ELPA_KERNELS(i) .ne. 0) then
write(error_unit,*) &
#if REALCASE == 1
REAL_ELPA_KERNEL_NAMES(i)
#endif
#if COMPLEXCASE == 1
COMPLEX_ELPA_KERNEL_NAMES(i)
#endif
endif
enddo
write(error_unit,*) " "
! check whether generic kernel is defined
if (AVAILABLE_&
&MATH_DATATYPE&
&_ELPA_KERNELS( &
#if REALCASE == 1
REAL_ELPA_KERNEL_GENERIC ) .eq. 1) then
#endif
#if COMPLEXCASE == 1
COMPLEX_ELPA_KERNEL_GENERIC ) .eq. 1) then
#endif
write(error_unit,*) "The default kernel &
&MATH_DATATYPE&
&_ELPA_KERNEL_GENERIC will be used !"
else
write(error_unit,*) "As default kernel ", &
&MATH_DATATYPE&
&_ELPA_KERNEL_NAMES(DEFAULT_&
&MATH_DATATYPE&
&_ELPA_KERNEL)," will be used"
endif
endif ! my_pe == 0
if (AVAILABLE_&
&MATH_DATATYPE&
&_ELPA_KERNELS( &
&MATH_DATATYPE&
&_ELPA_KERNEL_GENERIC) .eq. 1) then
THIS_ELPA_KERNEL = &
&MATH_DATATYPE&
&_ELPA_KERNEL_GENERIC
else
THIS_ELPA_KERNEL = DEFAULT_&
&MATH_DATATYPE&
&_ELPA_KERNEL
endif
endif
! check consistency between request for GPUs and defined kernel
if (do_useGPU) then
do_useGPU_trans_ev_tridi = .true.
if (THIS_ELPA_KERNEL .ne. &
&MATH_DATATYPE&
&_ELPA_KERNEL_GPU) then
write(error_unit,*) "GPU usage has been requested but compute kernel is defined as non-GPU! Aborting..."
success = .false.
return
endif
endif
! if (do_useGPU) then
! if (nblk .ne. 128) then
! write(error_unit,*) "In case of GPU usage the blocksize for ELPA 2stage has to be 128"
! success = .false.
! return
! endif
! endif
if (do_useGPU) then
if (nblk .ne. 128) then
! cannot run on GPU with this blocksize
! disable GPU usage for trans_ev_tridi
do_useGPU_trans_ev_tridi = .false.
THIS_ELPA_KERNEL = &
&MATH_DATATYPE&
&_ELPA_KERNEL_GENERIC
! no data transfer to GPU needed
endif
endif
bandwidth = -1
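! bandwidth is hard-coded to -1 here, i.e. the input matrix is treated as a full
! (non-banded) matrix; the intermediate band width nbw is then chosen automatically below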
if (bandwidth .ne. -1) then
nbw = bandwidth
if ((nbw == 0) .or. (mod(nbw, nblk) .ne. 0)) then
if (wantDebug) then
write(error_unit,*) "Specified bandwidth has to be a multiple of blocksize"
endif
print *, "Specified bandwidth has to be a multiple of blocksize"
success = .false.
return
endif
ttts = MPI_Wtime()
else
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
! on this and maybe allow a run-time optimization here
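! Worked example: with nblk = 16 the real case gives nbw = (63/16+1)*16 = 64 and the
! complex case nbw = (31/16+1)*16 = 32 (integer division); with do_useGPU the band
! width is simply set to the block size nblk.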
if (do_useGPU) then
nbw = nblk
else
#if REALCASE == 1
nbw = (63/nblk+1)*nblk
#elif COMPLEXCASE == 1
nbw = (31/nblk+1)*nblk
#endif
endif
num_blocks = (na-1)/nbw + 1
allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then