Commit b7e7b791 authored by Andreas Marek

Move auxiliary functions into a new module and make them public again

These functions were made private with ELPA releases 2016.05.001
and 2016.05.002, but they should be public
parent b2f9399a
@@ -13,6 +13,7 @@ libelpa@SUFFIX@_public_la_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@modules @FC_MODINC@
libelpa@SUFFIX@_public_la_SOURCES = \
src/elpa1.F90 \
src/elpa2.F90 \
src/elpa1_auxiliary.F90 \
src/elpa2_utilities.F90 \
src/elpa_utilities.F90
@@ -28,7 +29,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/mod_compute_hh_trafo_complex.F90 \
src/mod_pack_unpack_complex.F90 \
src/aligned_mem.F90 \
src/elpa1_compute.F90 \
src/elpa1_compute_private.F90 \
src/elpa2_compute.F90 \
src/elpa2_kernels/mod_fortran_interfaces.F90 \
src/elpa2_kernels/mod_single_hh_trafo_real.F90 \
@@ -83,6 +83,7 @@
module ELPA1
use precision
use elpa_utilities
use elpa1_auxiliary
implicit none
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#include "config-f90.h"
module elpa1_auxiliary
implicit none
PRIVATE ! set default to private
public :: mult_at_b_real !< Multiply real matrices A**T * B
public :: mult_ah_b_complex !< Multiply complex matrices A**H * B
public :: invert_trm_real !< Invert real triangular matrix
public :: invert_trm_complex !< Invert complex triangular matrix
public :: cholesky_real !< Cholesky factorization of a real matrix
public :: cholesky_complex !< Cholesky factorization of a complex matrix
contains
!> \brief cholesky_real: Cholesky factorization of a real symmetric matrix
!> \details
!>
!> \param na Order of matrix
!> \param a(lda,matrixCols) Distributed matrix which should be factorized.
!> Distribution is like in Scalapack.
!> Only the upper triangle needs to be set.
!> On return, the upper triangle contains the Cholesky factor
!> and the lower triangle is set to 0.
!> \param lda Leading dimension of a
!> \param matrixCols local columns of matrix a
!> \param nblk blocksize of cyclic distribution, must be the same in both directions!
!> \param mpi_comm_rows MPI communicator for rows
!> \param mpi_comm_cols MPI communicator for columns
!> \param wantDebug logical, more debug information on failure
!> \param success logical, reports success or failure
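!> \par Usage
!> Illustrative sketch only; it assumes na, a, lda, nblk, matrixCols and the row/column
!> communicators are already set up for ELPA's block-cyclic distribution:
!> \code
!>   use elpa1_auxiliary
!>   logical :: success
!>   call cholesky_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
!>                      .false., success)
!>   if (.not. success) print *,"cholesky_real failed"
!> \endcode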
subroutine cholesky_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
use elpa1_compute
use elpa_utilities
use elpa_mpi
implicit none
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols)   ! was: real a(lda,*)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
integer(kind=ik) :: n, nc, i, info
integer(kind=ik) :: lcs, lce, lrs, lre
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
real(kind=rk), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:)
logical, intent(in) :: wantDebug
logical, intent(out) :: success
integer(kind=ik) :: istat
character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("cholesky_real")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
success = .true.
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
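! (illustrative numbers for a hypothetical 4 x 6 process grid with nblk=32:
!  lcm(4,6)=12 gives a minimum tile_size of 32*12=384; rounding up to the smallest
!  multiple of 384 that is >= 128*max(np_rows,np_cols)=768 gives tile_size=768,
!  hence l_rows_tile=768/4=192 and l_cols_tile=768/6=128)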
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"cholesky_real: error when allocating tmp1 "//errorMessage
stop
endif
allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"cholesky_real: error when allocating tmp2 "//errorMessage
stop
endif
tmp1 = 0
tmp2 = 0
allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"cholesky_real: error when allocating tmatr "//errorMessage
stop
endif
allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"cholesky_real: error when allocating tmatc "//errorMessage
stop
endif
tmatr = 0
tmatc = 0
do n = 1, na, nblk
! Calculate first local row and column of the still remaining matrix
! on the local processor
l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)
l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1)
l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1)
if (n+nblk > na) then
! This is the last step, just do a Cholesky-Factorization
! of the remaining block
if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then
call dpotrf('U',na-n+1,a(l_row1,l_col1),lda,info)
if (info/=0) then
if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf"
success = .false.
return
endif
endif
exit ! Loop
endif
if (my_prow==prow(n, nblk, np_rows)) then
if (my_pcol==pcol(n, nblk, np_cols)) then
! The process owning the upper left remaining block does the
! Cholesky-Factorization of this block
call dpotrf('U',nblk,a(l_row1,l_col1),lda,info)
if (info/=0) then
if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf"
success = .false.
return
endif
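! Pack the upper triangle of the factored diagonal block column by column into tmp1;
! it is broadcast over mpi_comm_cols below and unpacked into the upper triangular block tmp2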
nc = 0
do i=1,nblk
tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1)
nc = nc+i
enddo
endif
#ifdef WITH_MPI
call MPI_Bcast(tmp1,nblk*(nblk+1)/2,MPI_REAL8,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr)
#endif
nc = 0
do i=1,nblk
tmp2(1:i,i) = tmp1(nc+1:nc+i)
nc = nc+i
enddo
if (l_cols-l_colx+1>0) &
call dtrsm('L','U','T','N',nblk,l_cols-l_colx+1,1.d0,tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda)
endif
do i=1,nblk
if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols)
#ifdef WITH_MPI
if (l_cols-l_colx+1>0) &
call MPI_Bcast(tmatc(l_colx,i),l_cols-l_colx+1,MPI_REAL8,prow(n, nblk, np_rows),mpi_comm_rows,mpierr)
#endif
enddo
! this has to be checked since it was changed substantially when the code was made type safe
call elpa_transpose_vectors_real (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, &
tmatr, ubound(tmatr,dim=1), mpi_comm_rows, &
n, na, nblk, nblk)
do i=0,(na-1)/tile_size
lcs = max(l_colx,i*l_cols_tile+1)
lce = min(l_cols,(i+1)*l_cols_tile)
lrs = l_rowx
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<lrs) cycle
call DGEMM('N','T',lre-lrs+1,lce-lcs+1,nblk,-1.d0, &
tmatr(lrs,1),ubound(tmatr,dim=1),tmatc(lcs,1),ubound(tmatc,dim=1), &
1.d0,a(lrs,lcs),lda)
enddo
enddo
deallocate(tmp1, tmp2, tmatr, tmatc, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"cholesky_real: error when deallocating tmp1 "//errorMessage
stop
endif
! Set the lower triangle to 0, it contains garbage (from the above matrix multiplications)
do i=1,na
if (my_pcol==pcol(i, nblk, np_cols)) then
! column i is on local processor
l_col1 = local_index(i , my_pcol, np_cols, nblk, +1) ! local column number
l_row1 = local_index(i+1, my_prow, np_rows, nblk, +1) ! first row below diagonal
a(l_row1:l_rows,l_col1) = 0
endif
enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("cholesky_real")
#endif
end subroutine cholesky_real
!> \brief invert_trm_real: Inverts an upper triangular matrix
!> \details
!> \param na Order of matrix
!> \param a(lda,matrixCols) Distributed matrix which should be inverted
!> Distribution is like in Scalapack.
!> Only the upper triangle needs to be set.
!> The lower triangle is not referenced.
!> \param lda Leading dimension of a
!> \param matrixCols local columns of matrix a
!> \param nblk blocksize of cyclic distribution, must be the same in both directions!
!> \param mpi_comm_rows MPI communicator for rows
!> \param mpi_comm_cols MPI communicator for columns
!> \param wantDebug logical, more debug information on failure
!> \param success logical, reports success or failure
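!> \par Usage
!> Illustrative sketch only; a is assumed to already hold the upper triangular factor
!> (e.g. from cholesky_real) in the same block-cyclic distribution:
!> \code
!>   call invert_trm_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
!>                        wantDebug, success)
!>   if (.not. success) print *,"invert_trm_real failed"
!> \endcode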
subroutine invert_trm_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success)
use precision
use elpa1_compute
use elpa_utilities
use elpa_mpi
implicit none
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=rk) :: a(lda,*)
#else
real(kind=rk) :: a(lda,matrixCols)
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
integer(kind=ik) :: n, nc, i, info, ns, nb
real(kind=rk), allocatable :: tmp1(:), tmp2(:,:), tmat1(:,:), tmat2(:,:)
logical, intent(in) :: wantDebug
logical, intent(out) :: success
integer(kind=ik) :: istat
character(200) :: errorMessage
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
success = .true.
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"invert_trm_real: error when allocating tmp1 "//errorMessage
stop
endif
allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"invert_trm_real: error when allocating tmp2 "//errorMessage
stop
endif
tmp1 = 0
tmp2 = 0
allocate(tmat1(l_rows,nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"invert_trm_real: error when allocating tmat1 "//errorMessage
stop
endif
allocate(tmat2(nblk,l_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"invert_trm_real: error when allocating tmat2 "//errorMessage
stop
endif
tmat1 = 0
tmat2 = 0
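! Sweep over the diagonal blocks from the last one (starting column ns) backwards to the
! first; each step inverts the current diagonal block (DTRTRI) and updates the remaining
! blocks of the inverse (DTRMM on the block row, DGEMM on the rows above)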
ns = ((na-1)/nblk)*nblk + 1
do n = ns,1,-nblk
l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)
nb = nblk
if (na-n+1 < nblk) nb = na-n+1
l_rowx = local_index(n+nb, my_prow, np_rows, nblk, +1)
l_colx = local_index(n+nb, my_pcol, np_cols, nblk, +1)
if (my_prow==prow(n, nblk, np_rows)) then
if (my_pcol==pcol(n, nblk, np_cols)) then
call DTRTRI('U','N',nb,a(l_row1,l_col1),lda,info)
if (info/=0) then
if (wantDebug) write(error_unit,*) "ELPA1_invert_trm_real: Error in DTRTRI"
success = .false.
return
endif
nc = 0
do i=1,nb
tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1)
nc = nc+i
enddo
endif
#ifdef WITH_MPI
call MPI_Bcast(tmp1,nb*(nb+1)/2,MPI_REAL8,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr)
#endif
nc = 0
do i=1,nb
tmp2(1:i,i) = tmp1(nc+1:nc+i)
nc = nc+i
enddo
if (l_cols-l_colx+1>0) &
call DTRMM('L','U','N','N',nb,l_cols-l_colx+1,1.d0,tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda)
if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols)
if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0
endif
if (l_row1>1) then
if (my_pcol==pcol(n, nblk, np_cols)) then
tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1)
a(1:l_row1-1,l_col1:l_col1+nb-1) = 0
endif
do i=1,nb
#ifdef WITH_MPI
call MPI_Bcast(tmat1(1,i),l_row1-1,MPI_REAL8,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr)
#endif
enddo
endif
#ifdef WITH_MPI
if (l_cols-l_col1+1>0) &
call MPI_Bcast(tmat2(1,l_col1),(l_cols-l_col1+1)*nblk,MPI_REAL8,prow(n, nblk, np_rows),mpi_comm_rows,mpierr)
#endif
if (l_row1>1 .and. l_cols-l_col1+1>0) &
call dgemm('N','N',l_row1-1,l_cols-l_col1+1,nb, -1.d0, &
tmat1,ubound(tmat1,dim=1),tmat2(1,l_col1),ubound(tmat2,dim=1), &
1.d0, a(1,l_col1),lda)
enddo
deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"invert_trm_real: error when deallocating tmp1 "//errorMessage
stop
endif
end subroutine invert_trm_real
!> \brief cholesky_complex: Cholesky factorization of a complex hermitian matrix
!> \details
!> \param na Order of matrix
!> \param a(lda,matrixCols) Distributed matrix which should be factorized.
!> Distribution is like in Scalapack.
!> Only the upper triangle needs to be set.
!> On return, the upper triangle contains the Cholesky factor
!> and the lower triangle is set to 0.
!> \param lda Leading dimension of a
!> \param matrixCols local columns of matrix a
!> \param nblk blocksize of cyclic distribution, must be the same in both directions!
!> \param mpi_comm_rows MPI communicator for rows
!> \param mpi_comm_cols MPI communicator for columns
!> \param wantDebug logical, more debug information on failure
!> \param success logical, reports success or failure
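!> \par Usage
!> Illustrative sketch only; the distributed matrix is complex hermitian with only the
!> upper triangle set, otherwise the setup matches the real case above:
!> \code
!>   complex(kind=ck) :: a(lda,matrixCols)
!>   call cholesky_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
!>                         wantDebug, success)
!> \endcode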
subroutine cholesky_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
use elpa1_compute
use elpa_utilities
use elpa_mpi
implicit none
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=ck) :: a(lda,*)
#else
complex(kind=ck) :: a(lda,matrixCols)
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
integer(kind=ik) :: n, nc, i, info
integer(kind=ik) :: lcs, lce, lrs, lre
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
complex(kind=ck), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:)
logical, intent(in) :: wantDebug
logical, intent(out) :: success
integer(kind=ik) :: istat
character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("cholesky_complex")
#endif
success = .true.
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"cholesky_complex: error when allocating tmp1 "//errorMessage
stop
endif
allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"cholesky_complex: error when allocating tmp2 "//errorMessage
stop
endif
tmp1 = 0
tmp2 = 0
allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"cholesky_complex: error when allocating tmatr "//errorMessage
stop
endif
allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"cholesky_complex: error when allocating tmatc "//errorMessage
stop
endif
tmatr = 0
tmatc = 0
do n = 1, na, nblk
! Calculate first local row and column of the still remaining matrix
! on the local processor
l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)
l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1)
l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1)
if (n+nblk > na) then
! This is the last step, just do a Cholesky-Factorization
! of the remaining block
if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then
call zpotrf('U',na-n+1,a(l_row1,l_col1),lda,info)
if (info/=0) then
if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf"
success = .false.
return
endif
endif
exit ! Loop
endif
if (my_prow==prow(n, nblk, np_rows)) then
if (my_pcol==pcol(n, nblk, np_cols)) then
! The process owning the upper left remaining block does the
! Cholesky-Factorization of this block
call zpotrf('U',nblk,a(l_row1,l_col1),lda,info)
if (info/=0) then
if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf"