Commit 2998fac3 authored by Andreas Marek

Split file elpa2.F90 into elpa2.F90 and elpa2_compute.F90

As in a previous commit for elpa1.F90, elpa2.F90 has been split into two
files to allow automatic generation of the documentation and to provide a
lean, easy-to-understand user interface:

elpa2.F90
the visible user functions, which provide the library calls; their
usage is the same as before.

elpa2_compute.F90
all internal routines, which are used by ELPA2 but are never called
from outside the library by a user. These routines are now "hidden"
in the module elpa2_compute, which is used by ELPA2.

The procedures in elpa2_compute.F90 are identical to the ones in
elpa2.F90 before this split commit. The only changes -- though there are
quite a lot of them -- are indentation changes.
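
For illustration, a minimal sketch of the resulting module layout. The
names elpa2_demo, internal_work and user_driver are placeholders, not the
real ELPA symbols; the actual modules contain the full set of ELPA2
routines and their unchanged interfaces:

    ! elpa2_compute.F90: internal routines, hidden from users of the library
    module elpa2_compute
      implicit none
    contains
      subroutine internal_work()          ! stands in for bandred_real etc.
      end subroutine internal_work
    end module elpa2_compute

    ! elpa2.F90: the lean user-visible interface; usage is unchanged
    module elpa2_demo                     ! the real module is named ELPA2
      use elpa2_compute                   ! internal routines become available to ELPA2 ...
      implicit none
      private                             ! ... but are not re-exported to the user
      public :: user_driver               ! stands in for solve_evp_real_2stage etc.
    contains
      subroutine user_driver()
        call internal_work()              ! the public driver delegates to the hidden routine
      end subroutine user_driver
    end module elpa2_demo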
parent 9710bf08
Makefile.am
@@ -13,6 +13,7 @@ libelpa@SUFFIX@_la_SOURCES = src/elpa_utilities.F90 \
src/elpa1_compute.F90 \
src/elpa1.F90 \
src/elpa2_utilities.F90 \
src/elpa2_compute.F90 \
src/elpa2.F90 \
src/elpa_c_interface.F90 \
src/elpa_qr/qr_utils.f90 \
src/elpa2.F90
@@ -66,9 +66,10 @@ module ELPA2
! Version 1.1.2, 2011-02-21
use elpa_utilities
USE elpa1_compute
use elpa1_compute
use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve
use elpa2_utilities
use elpa2_compute
use elpa_pdgeqrf
implicit none
@@ -80,34 +81,7 @@ module ELPA2
public :: solve_evp_real_2stage
public :: solve_evp_complex_2stage
public :: bandred_real
public :: tridiag_band_real
public :: trans_ev_tridi_to_band_real
public :: trans_ev_band_to_full_real
public :: bandred_complex
public :: tridiag_band_complex
public :: trans_ev_tridi_to_band_complex
public :: trans_ev_band_to_full_complex
public :: band_band_real
public :: divide_band
integer, public :: which_qr_decomposition = 1 ! defines which QR-decomposition algorithm will be used
! 0 for unblocked
! 1 for blocked (maxrank: nblk)
!-------------------------------------------------------------------------------
! The following array contains the Householder vectors of the
! transformation band -> tridiagonal.
! It is allocated and set in tridiag_band_real and used in
! trans_ev_tridi_to_band_real.
! It must be deallocated by the user after trans_ev_tridi_to_band_real!
! real*8, allocatable :: hh_trans_real(:,:)
! complex*16, allocatable :: hh_trans_complex(:,:)
!-------------------------------------------------------------------------------
include 'mpif.h'
@@ -556,5643 +530,4 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
end function solve_evp_complex_2stage
!-------------------------------------------------------------------------------
subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, success, useQR)
!-------------------------------------------------------------------------------
! bandred_real: Reduces a distributed symmetric matrix to band form
!
! Parameters
!
! na Order of matrix
!
! a(lda,matrixCols) Distributed matrix which should be reduced.
! Distribution is like in Scalapack.
! As opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
! a(:,:) is overwritten on exit with the band and the Householder vectors
! in the upper half.
!
! lda Leading dimension of a
! matrixCols local columns of matrix a
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwidth of output matrix
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1
! Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
#ifdef WITH_OPENMP
use omp_lib
#endif
implicit none
integer :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
real*8 :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
integer :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer :: l_cols, l_rows
integer :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
integer :: istep, ncol, lch, lcx, nlc, mynlc
integer :: tile_size, l_rows_tile, l_cols_tile
real*8 :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
real*8, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)
! needed for blocked QR decomposition
integer :: PQRPARAM(11), work_size
real*8 :: dwork_size(1)
real*8, allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)
logical, intent(in) :: wantDebug
logical, intent(out):: success
logical, intent(in) :: useQR
integer :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, ii, pp, transformChunkSize
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("bandred_real")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
success = .true.
! Semibandwidth nbw must be a multiple of blocksize nblk
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
if (useQR) then
if (which_qr_decomposition == 1) then
call qr_pqrparam_init(pqrparam, nblk,'M',0, nblk,'M',0, nblk,'M',1,'s')
allocate(tauvector(na))
allocate(blockheuristic(nblk))
l_rows = local_index(na, my_prow, np_rows, nblk, -1)
allocate(vmr(max(l_rows,1),na))
call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), tmat(1,1,1), nbw, dwork_size(1), -1, na, &
nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
work_size = dwork_size(1)
allocate(work_blocked(work_size))
work_blocked = 0.0d0
deallocate(vmr)
endif
endif
do istep = (na-1)/nbw, 1, -1
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
! Number of local columns/rows of remaining matrix
l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
allocate(vmr(max(l_rows,1),2*n_cols))
allocate(umc(max(l_cols,1),2*n_cols))
allocate(vr(l_rows+1))
vmr(1:l_rows,1:n_cols) = 0.
vr(:) = 0
tmat(:,:,istep) = 0
! Reduce current block to lower triangular form
if (useQR) then
if (which_qr_decomposition == 1) then
call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), &
tmat(1,1,istep), nbw, work_blocked, &
work_size, na, n_cols, nblk, nblk, &
istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
blockheuristic)
endif
else
do lc = n_cols, 1, -1
ncol = istep*nbw + lc ! absolute column number of householder vector
nrow = ncol - nbw ! Absolute number of pivot row
lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
tau = 0
if (nrow == 1) exit ! Nothing to do
cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block
if (my_pcol==cur_pcol) then
! Get vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
vr(1:lr) = a(1:lr,lch) ! vector to be transformed
if (my_prow==prow(nrow, nblk, np_rows)) then
aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
aux1(2) = vr(lr)
else
aux1(1) = dot_product(vr(1:lr),vr(1:lr))
aux1(2) = 0.
endif
call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
vnorm2 = aux2(1)
vrl = aux2(2)
! Householder transformation
call hh_transform_real(vrl, vnorm2, xf, tau)
! Scale vr and store Householder vector for back transformation
vr(1:lr) = vr(1:lr) * xf
if (my_prow==prow(nrow, nblk, np_rows)) then
a(1:lr-1,lch) = vr(1:lr-1)
a(lr,lch) = vrl
vr(lr) = 1.
else
a(1:lr,lch) = vr(1:lr)
endif
endif
! Broadcast Householder vector and tau along columns
vr(lr+1) = tau
call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
vmr(1:lr,lc) = vr(1:lr)
tau = vr(lr+1)
tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
! Transform remaining columns in current block with Householder vector
! Local dot product
aux1 = 0
#ifdef WITH_OPENMP
!Open up one omp region to avoid paying openmp overhead.
!This does not help performance due to the addition of two openmp barriers around the MPI call,
!but in the future this may be beneficial if these barriers are replaced with a faster implementation
!$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
mynlc = 0 ! number of local columns
!This loop does not have independent iterations,
!'mynlc' is incremented each iteration, and it is difficult to remove this dependency
!Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
!That is, a thread only executes the work associated with an iteration if its thread id is congruent to
!the iteration number modulo the number of threads
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0 ) then
mynlc = mynlc+1
if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
endif
enddo
! Get global dot products
!$omp barrier
!$omp single
if (mynlc>0) call mpi_allreduce(aux1,aux2,mynlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
!$omp end single
!$omp barrier
! Transform
transformChunkSize=32
mynlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
mynlc = mynlc+1
!This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
!However, for some reason this is slower than doing it manually, so it is parallelized as below.
do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
do pp = 1,transformChunkSize
if (pp + ii > lr) exit
a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
enddo
enddo
endif
enddo
!$omp end parallel
#else
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
enddo
! Get global dot products
if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
endif
enddo
#endif
enddo
! Calculate scalar products of stored Householder vectors.
! This can be done in different ways, we use dsyrk
vav = 0
if (l_rows>0) &
call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,dim=1),0.d0,vav,ubound(vav,dim=1))
call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows)
! Calculate triangular matrix T for block Householder Transformation
do lc=n_cols,1,-1
tau = tmat(lc,lc,istep)
if (lc<n_cols) then
call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,dim=1),vav(lc+1,lc),1)
tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
endif
enddo
endif
! Transpose vmr -> vmc (stored in umc, second half)
call elpa_transpose_vectors_real (vmr, ubound(vmr,dim=1), mpi_comm_rows, &
umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, &
1, istep*nbw, n_cols, nblk)
! Calculate umc = A**T * vmr
! Note that the distributed A has to be transposed
! As opposed to direct tridiagonalization, there is no need to use the cache locality
! of the tiles, so we can use strips of the matrix
!Code for Algorithm 4
n_way = 1
#ifdef WITH_OPENMP
n_way = omp_get_max_threads()
#endif
!umc(1:l_cols,1:n_cols) = 0.d0
!vmr(1:l_rows,n_cols+1:2*n_cols) = 0
#ifdef WITH_OPENMP
!$omp parallel private( i,lcs,lce,lrs,lre)
#endif
if(n_way > 1) then
!$omp do
do i=1,min(l_cols_tile, l_cols)
umc(i,1:n_cols) = 0.d0
enddo
!$omp do
do i=1,l_rows
vmr(i,n_cols+1:2*n_cols) = 0.d0
enddo
if (l_cols>0 .and. l_rows>0) then
!SYMM variant 4
!Partitioned Matrix Expression:
! Ct = Atl Bt + Atr Bb
! Cb = Atr' Bt + Abl Bb
!
!Loop invariant:
! Ct = Atl Bt + Atr Bb
!
!Update:
! C1 = A10'B0 + A11B1 + A21 B2
!
!This algorithm was chosen because in this algorithm the loop around the dgemm calls
!is easily parallelized, and regardless of the choice of algorithm,
!the startup cost for parallelizing the dgemms inside the loop is too great
!$omp do schedule(static,1)
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1 ! local column start
lce = min(l_cols, (i+1)*l_cols_tile) ! local column end
lrs = i*l_rows_tile+1 ! local row start
lre = min(l_rows, (i+1)*l_rows_tile) ! local row end
!C1 += [A11 A12] [B1
! B2]
if( lre > lrs .and. l_cols > lcs ) then
call DGEMM('N','N', lre-lrs+1, n_cols, l_cols-lcs+1, &
1.d0, a(lrs,lcs), ubound(a,dim=1), &
umc(lcs,n_cols+1), ubound(umc,dim=1), &
0.d0, vmr(lrs,n_cols+1), ubound(vmr,dim=1))
endif
! C1 += A10' B0
if( lce > lcs .and. i > 0 ) then
call DGEMM('T','N', lce-lcs+1, n_cols, lrs-1, &
1.d0, a(1,lcs), ubound(a,dim=1), &
vmr(1,1), ubound(vmr,dim=1), &
0.d0, umc(lcs,1), ubound(umc,dim=1))
endif
enddo
endif
else
umc(1:l_cols,1:n_cols) = 0.d0
vmr(1:l_rows,n_cols+1:2*n_cols) = 0
if (l_cols>0 .and. l_rows>0) then
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
lre = min(l_rows,(i+1)*l_rows_tile)
call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,dim=1), &
vmr,ubound(vmr,dim=1),1.d0,umc(lcs,1),ubound(umc,dim=1))
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
umc(lcs,n_cols+1),ubound(umc,dim=1),1.d0,vmr(1,n_cols+1),ubound(vmr,dim=1))
enddo
endif
endif
#ifdef WITH_OPENMP
!$omp end parallel
#endif
! Sum up all ur(:) parts along rows and add them to the uc(:) parts
! on the processors containing the diagonal
! This is only necessary if ur has been calculated, i.e. if the
! global tile size is smaller than the global remaining matrix
! or if we used Algorithm 4
if (tile_size < istep*nbw .or. n_way > 1) then
call elpa_reduce_add_vectors_real (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, &
umc, ubound(umc,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
endif
if (l_cols>0) then
allocate(tmp(l_cols,n_cols))
call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
deallocate(tmp)
endif
! U = U * Tmat**T
call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,dim=1),umc,ubound(umc,dim=1))
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umc,ubound(umc,dim=1),umc(1,n_cols+1), &
ubound(umc,dim=1),0.d0,vav,ubound(vav,dim=1))
call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep), &
ubound(tmat,dim=1),vav,ubound(vav,dim=1))
call symm_matrix_allreduce(n_cols,vav, nbw, nbw ,mpi_comm_cols)
! U = U - 0.5 * V * VAV
call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umc(1,n_cols+1),ubound(umc,dim=1),vav, &
ubound(vav,dim=1),1.d0,umc,ubound(umc,dim=1))
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_real (umc, ubound(umc,dim=1), mpi_comm_cols, &
vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
! A = A - V*U**T - U*V**T
#ifdef WITH_OPENMP
!$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
n_threads = omp_get_num_threads()
if(mod(n_threads, 2) == 0) then
n_way = 2
else
n_way = 1
endif
m_way = n_threads / n_way
m_id = mod(omp_get_thread_num(), m_way)
n_id = omp_get_thread_num() / m_way
do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way
i = ii / tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<1) cycle
!Figure out this thread's range
work_per_thread = lre / m_way
if (work_per_thread * m_way < lre) work_per_thread = work_per_thread + 1
mystart = m_id * work_per_thread + 1
myend = mystart + work_per_thread - 1
if( myend > lre ) myend = lre
if( myend-mystart+1 < 1) cycle
call dgemm('N','T',myend-mystart+1, lce-lcs+1, 2*n_cols, -1.d0, &
vmr(mystart, 1), ubound(vmr,1), umc(lcs,1), ubound(umc,1), &
1.d0,a(mystart,lcs),ubound(a,1))
enddo
!$omp end parallel
#else /* WITH_OPENMP */
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<1) cycle
call dgemm('N','T',lre,lce-lcs+1,2*n_cols,-1.d0, &
vmr,ubound(vmr,dim=1),umc(lcs,1),ubound(umc,dim=1), &
1.d0,a(1,lcs),lda)
enddo
#endif /* WITH_OPENMP */
deallocate(vmr, umc, vr)
enddo
if (useQR) then
if (which_qr_decomposition == 1) then
deallocate(work_blocked)
deallocate(tauvector)
endif
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("bandred_real")
#endif
end subroutine bandred_real
!-------------------------------------------------------------------------------
subroutine symm_matrix_allreduce(n,a,lda,ldb,comm)
!-------------------------------------------------------------------------------
! symm_matrix_allreduce: Does an mpi_allreduce for a symmetric matrix A.
! On entry, only the upper half of A needs to be set
! On exit, the complete matrix is set
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer :: n, lda, ldb, comm
real*8 :: a(lda,ldb)
integer :: i, nc, mpierr
real*8 :: h1(n*n), h2(n*n)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("symm_matrix_allreduce")
#endif
nc = 0
do i=1,n
h1(nc+1:nc+i) = a(1:i,i)
nc = nc+i
enddo
call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,comm,mpierr)
nc = 0
do i=1,n