Commit 2626ebcc authored by Thomas Auckenthaler's avatar Thomas Auckenthaler
Browse files

Band-band reduction (band_band_real) added to elpa2. divide_band is now defined as public.

parent d7a57612
......@@ -29,6 +29,9 @@ module ELPA2
public :: tridiag_band_complex
public :: trans_ev_tridi_to_band_complex
public :: trans_ev_band_to_full_complex
public :: band_band_real
public :: divide_band
!-------------------------------------------------------------------------------
......@@ -3409,4 +3412,333 @@ subroutine divide_band(nblocks_total, n_pes, block_limits)
end subroutine
!-------------------------------------------------------------------------------
subroutine band_band_real(na, nb, nb2, ab, ab2, d, e, mpi_comm)
!-------------------------------------------------------------------------------
! band_band_real:
! Reduces a real symmetric banded matrix to a real symmetric matrix with smaller bandwidth. Householder transformations are not stored.
! Matrix size na and original bandwidth nb have to be a multiple of the target bandwidth nb2. (Hint: expand your matrix with zero entries, if this
! requirement doesn't hold)
!
! na Order of matrix
!
! nb Semi bandwidth of original matrix
!
! nb2 Semi bandwidth of target matrix
!
! ab Input matrix with bandwidth nb. The leading dimension of the banded matrix has to be 2*nb. The parallel data layout
! has to be accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb+1 to min(na, block_limits(n+1)*nb)
! are located on rank n.
!
! ab2 Output matrix with bandwidth nb2. The leading dimension of the banded matrix is 2*nb2. The parallel data layout is
! accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb2+1 to min(na, block_limits(n+1)*nb2) are located
! on rank n.
!
! d(na) Diagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
!
! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
!
! mpi_comm
! MPI-Communicator for the total processor set
!-------------------------------------------------------------------------------
implicit none
integer, intent(in) :: na, nb, nb2, mpi_comm
real*8, intent(inout) :: ab(2*nb,*)
real*8, intent(inout) :: ab2(2*nb2,*)
real*8, intent(out) :: d(na), e(na) ! set only on PE 0
!----------------
real*8 hv(nb,nb2), w(nb,nb2), w_new(nb,nb2), tau(nb2), hv_new(nb,nb2), tau_new(nb2), ab_s(1+nb,nb2), ab_r(1+nb,nb2), ab_s2(2*nb2,nb2), hv_s(nb,nb2)
real*8 work(nb*nb2), work2(nb2*nb2)
integer lwork, info
integer istep, i, n, dest
integer n_off, na_s
integer my_pe, n_pes, mpierr
integer nblocks_total, nblocks
integer nblocks_total2, nblocks2
integer ireq_ab, ireq_hv
integer, allocatable :: block_limits(:), block_limits2(:), ireq_ab2(:)
!----------------
integer j, nc, nr, ns, ne, iblk
call mpi_comm_rank(mpi_comm,my_pe,mpierr)
call mpi_comm_size(mpi_comm,n_pes,mpierr)
! Total number of blocks in the band:
nblocks_total = (na-1)/nb + 1
nblocks_total2 = (na-1)/nb2 + 1
! Set work distribution
allocate(block_limits(0:n_pes))
call divide_band(nblocks_total, n_pes, block_limits)
allocate(block_limits2(0:n_pes))
call divide_band(nblocks_total2, n_pes, block_limits2)
! nblocks: the number of blocks for my task
nblocks = block_limits(my_pe+1) - block_limits(my_pe)
nblocks2 = block_limits2(my_pe+1) - block_limits2(my_pe)
allocate(ireq_ab2(1:nblocks2))
ireq_ab2 = MPI_REQUEST_NULL
if(nb2>1) then
do i=0,nblocks2-1
call mpi_irecv(ab2(1,i*nb2+1),2*nb2*nb2,mpi_real8,0,3,mpi_comm,ireq_ab2(i+1),mpierr)
enddo
endif
! n_off: Offset of ab within band
n_off = block_limits(my_pe)*nb
lwork = nb*nb2
dest = 0
ireq_ab = MPI_REQUEST_NULL
ireq_hv = MPI_REQUEST_NULL
! ---------------------------------------------------------------------------
! Start of calculations
na_s = block_limits(my_pe)*nb + 1
if(my_pe>0 .and. na_s<=na) then
! send first nb2 columns to previous PE
! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
do i=1,nb2
ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1)
enddo
call mpi_isend(ab_s,(nb+1)*nb2,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr)
endif
do istep=1,na/nb2
if(my_pe==0) then
n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced
hv(:,:) = 0
tau(:) = 0
! The last step (istep=na-1) is only needed for sending the last HH vectors.
! We don't want the sign of the last element flipped (analogous to the other sweeps)
if(istep < na/nb2) then
! Transform first block column of remaining matrix
call dgeqrf(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info);
do i=1,nb2
hv(i,i) = 1.0
hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1)
ab(1+nb2+1:2*nb,na_s-n_off+i-1) = 0
enddo
endif
if(nb2==1) then
d(istep) = ab(1,na_s-n_off)
e(istep) = ab(2,na_s-n_off)
if(istep == na) then
e(na) = 0
endif
else
ab_s2 = 0
ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1)
if(block_limits2(dest+1)<istep) then
dest = dest+1
endif
call mpi_send(ab_s2,2*nb2*nb2,mpi_real8,dest,3,mpi_comm,mpierr)
endif
else
if(na>na_s+nb2-1) then
! Receive Householder vectors from previous task, from PE owning subdiagonal
call mpi_recv(hv,nb*nb2,mpi_real8,my_pe-1,2,mpi_comm,MPI_STATUS_IGNORE,mpierr)
do i=1,nb2
tau(i) = hv(i,i)
hv(i,i) = 1.
enddo
endif
endif
na_s = na_s+nb2
if(na_s-n_off > nb) then
ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0
n_off = n_off + nb
endif
do iblk=1,nblocks
ns = na_s + (iblk-1)*nb - n_off ! first column in block
ne = ns+nb-nb2 ! last column in block
if(ns+n_off>na) exit
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
call wy_gen(nc,nb2,w,hv,tau,work,nb)
if(iblk==nblocks .and. nc==nb) then
!request last nb2 columns
call mpi_recv(ab_r,(nb+1)*nb2,mpi_real8,my_pe+1,1,mpi_comm,MPI_STATUS_IGNORE,mpierr)
do i=1,nb2
ab(1:nb+1,ne+i-1) = ab_r(:,i)
enddo
endif
hv_new(:,:) = 0 ! Needed, last rows must be 0 for nr < nb
tau_new(:) = 0
if(nr>0) then
call wy_right(nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb)
call dgeqrf(nr,nb2,ab(nb+1,ns),2*nb-1,tau_new,work,lwork,info);
do i=1,nb2
hv_new(i,i) = 1.0
hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1)
ab(nb+2:,ns+i-1) = 0
enddo
!send hh-vector
if(iblk==nblocks) then
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
hv_s = hv_new
do i=1,nb2
hv_s(i,i) = tau_new(i)
enddo
call mpi_isend(hv_s,nb*nb2,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr)
endif
endif
call wy_symm(nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb)
if(my_pe>0 .and. iblk==1) then
!send first nb2 columns to previous PE
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
do i=1,nb2
ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1)
enddo
call mpi_isend(ab_s,(nb+1)*nb2,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr)
endif
if(nr>0) then
call wy_gen(nr,nb2,w_new,hv_new,tau_new,work,nb)
call wy_left(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb)
endif
! Use new HH vector for the next block
hv(:,:) = hv_new(:,:)
tau = tau_new
enddo
enddo
! Finish the last outstanding requests
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
call mpi_waitall(nblocks2,ireq_ab2,MPI_STATUSES_IGNORE,mpierr)
call mpi_barrier(mpi_comm,mpierr)
deallocate(block_limits)
deallocate(block_limits2)
deallocate(ireq_ab2)
end subroutine
! --------------------------------------------------------------------------------------------------
subroutine wy_gen(n, nb, W, Y, tau, mem, lda)
integer, intent(in) :: n !length of householder-vectors
integer, intent(in) :: nb !number of householder-vectors
integer, intent(in) :: lda !leading dimension of Y and W
real*8, intent(in) :: Y(lda,nb) !matrix containing nb householder-vectors of length b
real*8, intent(in) :: tau(nb) !tau values
real*8, intent(out) :: W(lda,nb) !output matrix W
real*8, intent(in) :: mem(nb) !memory for a temporary matrix of size nb
integer i
W(1:n,1) = tau(1)*Y(1:n,1)
do i=2,nb
W(1:n,i) = tau(i)*Y(1:n,i)
call DGEMV('T',n,i-1,1.d0,Y,lda,W(1,i),1,0.d0,mem,1)
call DGEMV('N',n,i-1,-1.d0,W,lda,mem,1,1.d0,W(1,i),1)
enddo
end subroutine
! --------------------------------------------------------------------------------------------------
subroutine wy_left(n, m, nb, A, lda, W, Y, mem, lda2)
integer, intent(in) :: n !width of the matrix A
integer, intent(in) :: m !length of matrix W and Y
integer, intent(in) :: nb !width of matrix W and Y
integer, intent(in) :: lda !leading dimension of A
integer, intent(in) :: lda2 !leading dimension of W and Y
real*8, intent(inout) :: A(lda,*) !matrix to be transformed
real*8, intent(in) :: W(m,nb) !blocked transformation matrix W
real*8, intent(in) :: Y(m,nb) !blocked transformation matrix Y
real*8, intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
call DGEMM('T', 'N', nb, n, m, 1.d0, W, lda2, A, lda, 0.d0, mem, nb)
call DGEMM('N', 'N', m, n, nb, -1.d0, Y, lda2, mem, nb, 1.d0, A, lda)
end subroutine
! --------------------------------------------------------------------------------------------------
subroutine wy_right(n, m, nb, A, lda, W, Y, mem, lda2)
integer, intent(in) :: n !height of the matrix A
integer, intent(in) :: m !length of matrix W and Y
integer, intent(in) :: nb !width of matrix W and Y
integer, intent(in) :: lda !leading dimension of A
integer, intent(in) :: lda2 !leading dimension of W and Y
real*8, intent(inout) :: A(lda,*) !matrix to be transformed
real*8, intent(in) :: W(m,nb) !blocked transformation matrix W
real*8, intent(in) :: Y(m,nb) !blocked transformation matrix Y
real*8, intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
call DGEMM('N', 'N', n, nb, m, 1.d0, A, lda, W, lda2, 0.d0, mem, n)
call DGEMM('N', 'T', n, m, nb, -1.d0, mem, n, Y, lda2, 1.d0, A, lda)
end subroutine
! --------------------------------------------------------------------------------------------------
subroutine wy_symm(n, nb, A, lda, W, Y, mem, mem2, lda2)
integer, intent(in) :: n !width/heigth of the matrix A; length of matrix W and Y
integer, intent(in) :: nb !width of matrix W and Y
integer, intent(in) :: lda !leading dimension of A
integer, intent(in) :: lda2 !leading dimension of W and Y
real*8, intent(inout) :: A(lda,*) !matrix to be transformed
real*8, intent(in) :: W(n,nb) !blocked transformation matrix W
real*8, intent(in) :: Y(n,nb) !blocked transformation matrix Y
real*8 :: mem(n,nb) !memory for a temporary matrix of size n x nb
real*8 :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb
call DSYMM('L', 'L', n, nb, 1.d0, A, lda, W, lda2, 0.d0, mem, n)
call DGEMM('T', 'N', nb, nb, n, 1.d0, mem, n, W, lda2, 0.d0, mem2, nb)
call DGEMM('N', 'N', n, nb, nb, -0.5d0, Y, lda2, mem2, nb, 1.d0, mem, n)
call DSYR2K('L', 'N', n, nb, -1.d0, Y, lda2, mem, n, 1.d0, A, lda)
end subroutine
! --------------------------------------------------------------------------------------------------
end module ELPA2
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment