Commit 85b2b68c authored by Alexander Heinecke's avatar Alexander Heinecke
Browse files

removed multiple blocking in band_to_full

parent 13ef90f3
......@@ -684,9 +684,9 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
integer l_cols, l_rows, l_colh, n_cols
integer istep, lc, ncol, nrow, nb, ns
real*8, allocatable:: tmp1(:), tmp2(:), hvb(:), hvm(:,:), tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:)
real*8, allocatable:: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
integer pcol, prow, i, cwy_blocking, t_blocking, t_cols, t_rows
integer pcol, prow, i
pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number
prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number
......@@ -696,32 +696,25 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
max_blocks_row = ((na -1)/nblk)/np_rows + 1 ! Rows of A
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
max_local_rows = max_blocks_row*nblk
max_local_cols = max_blocks_col*nblk
t_blocking = 2 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
cwy_blocking = t_blocking * nbw
allocate(tmp1(max_local_cols*cwy_blocking))
allocate(tmp2(max_local_cols*cwy_blocking))
allocate(hvb(max_local_rows*cwy_blocking))
allocate(hvm(max_local_rows,cwy_blocking))
allocate(tmat_complete(cwy_blocking,cwy_blocking))
allocate(t_tmp(cwy_blocking,nbw))
allocate(t_tmp2(cwy_blocking,nbw))
allocate(tmp1(max_local_cols*nbw))
allocate(tmp2(max_local_cols*nbw))
allocate(hvb(max_local_rows*nbw))
allocate(hvm(max_local_rows,nbw))
hvm = 0 ! Must be set to 0 !!!
hvb = 0 ! Safety only
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
do istep=1,((na-1)/nbw-1)/t_blocking + 1
do istep=1,(na-1)/nbw
n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
! Broadcast all Householder vectors for current step compressed in hvb
......@@ -729,7 +722,7 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
ns = 0
do lc = 1, n_cols
ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder vector
ncol = istep*nbw + lc ! absolute column number of householder vector
nrow = ncol - nbw ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
......@@ -749,7 +742,7 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
nb = 0
do lc = 1, n_cols
nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row
nrow = (istep-1)*nbw+lc ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
......@@ -758,40 +751,26 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
nb = nb+l_rows
enddo
l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1)
! compute tmat2 out of tmat(:,:,)
tmat_complete = 0
do i = 1, t_blocking
t_cols = MIN(nbw, n_cols - (i-1)*nbw)
if(t_cols <= 0) exit
t_rows = (i - 1) * nbw
tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i)
if(i > 1) then
call dgemm('T', 'N', t_rows, t_cols, l_rows, 1.d0, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), max_local_rows, 0.d0, t_tmp, cwy_blocking)
call mpi_allreduce(t_tmp,t_tmp2,cwy_blocking*nbw,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
call dtrmm('L','U','N','N',t_rows,t_cols,1.0d0,tmat_complete,cwy_blocking,t_tmp2,cwy_blocking)
call dtrmm('R','U','N','N',t_rows,t_cols,-1.0d0,tmat_complete(t_rows+1,t_rows+1),cwy_blocking,t_tmp2,cwy_blocking)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
endif
enddo
l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1)
! Q = Q - V * T**T * V**T * Q
if(l_rows>0) then
call dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm,ubound(hvm,1), q,ldq,0.d0,tmp1,n_cols)
call dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm,ubound(hvm,1), &
q,ldq,0.d0,tmp1,n_cols)
else
tmp1(1:l_cols*n_cols) = 0
endif
call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
if(l_rows>0) then
call dtrmm('L','U','T','N',n_cols,l_cols,1.0d0,tmat_complete,cwy_blocking,tmp2,n_cols)
call dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm,ubound(hvm,1), tmp2,n_cols,1.d0,q,ldq)
call dtrmm('L','U','T','N',n_cols,l_cols,1.0d0,tmat(1,1,istep),ubound(tmat,1),tmp2,n_cols)
call dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm,ubound(hvm,1), &
tmp2,n_cols,1.d0,q,ldq)
endif
enddo
deallocate(tmp1, tmp2, hvb, hvm, tmat_complete, t_tmp, t_tmp2)
deallocate(tmp1, tmp2, hvb, hvm)
end subroutine trans_ev_band_to_full_real
......@@ -2208,9 +2187,9 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
integer l_cols, l_rows, l_colh, n_cols
integer istep, lc, ncol, nrow, nb, ns
complex*16, allocatable:: tmp1(:), tmp2(:), hvb(:), hvm(:,:), tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:)
complex*16, allocatable:: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
integer pcol, prow, i, cwy_blocking, t_blocking, t_cols, t_rows
integer pcol, prow, i
pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number
prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number
......@@ -2220,32 +2199,25 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
max_blocks_row = ((na -1)/nblk)/np_rows + 1 ! Rows of A
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
max_local_rows = max_blocks_row*nblk
max_local_cols = max_blocks_col*nblk
t_blocking = 2 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
cwy_blocking = t_blocking * nbw
allocate(tmp1(max_local_cols*cwy_blocking))
allocate(tmp2(max_local_cols*cwy_blocking))
allocate(hvb(max_local_rows*cwy_blocking))
allocate(hvm(max_local_rows,cwy_blocking))
allocate(tmat_complete(cwy_blocking,cwy_blocking))
allocate(t_tmp(cwy_blocking,nbw))
allocate(t_tmp2(cwy_blocking,nbw))
allocate(tmp1(max_local_cols*nbw))
allocate(tmp2(max_local_cols*nbw))
allocate(hvb(max_local_rows*nbw))
allocate(hvm(max_local_rows,nbw))
hvm = 0 ! Must be set to 0 !!!
hvb = 0 ! Safety only
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
do istep=1,((na-1)/nbw-1)/t_blocking + 1
do istep=1,(na-1)/nbw
n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
! Broadcast all Householder vectors for current step compressed in hvb
......@@ -2253,7 +2225,7 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
ns = 0
do lc = 1, n_cols
ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder vector
ncol = istep*nbw + lc ! absolute column number of householder vector
nrow = ncol - nbw ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
......@@ -2273,7 +2245,7 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
nb = 0
do lc = 1, n_cols
nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row
nrow = (istep-1)*nbw+lc ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
......@@ -2282,25 +2254,9 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
nb = nb+l_rows
enddo
l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1)
l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1)
! Q = Q - V * T**T * V**T * Q
! compute tmat2 out of tmat(:,:,)
tmat_complete(:,:) = CZERO
do i = 1, t_blocking
t_cols = MIN(nbw, n_cols - (i-1)*nbw)
if(t_cols <= 0) exit
t_rows = (i - 1) * nbw
tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i)
if(i > 1) then
call zgemm('C', 'N', t_rows, t_cols, l_rows, CONE, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), max_local_rows, CZERO, t_tmp, cwy_blocking)
call mpi_allreduce(t_tmp,t_tmp2,cwy_blocking*nbw,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
call ztrmm('L','U','N','N',t_rows,t_cols,CONE,tmat_complete,cwy_blocking,t_tmp2,cwy_blocking)
call ztrmm('R','U','N','N',t_rows,t_cols,-CONE,tmat_complete(t_rows+1,t_rows+1),cwy_blocking,t_tmp2,cwy_blocking)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
endif
enddo
if(l_rows>0) then
call zgemm('C','N',n_cols,l_cols,l_rows,CONE,hvm,ubound(hvm,1), &
......@@ -2310,14 +2266,14 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
endif
call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
if(l_rows>0) then
call ztrmm('L','U','C','N',n_cols,l_cols,CONE,tmat_complete,cwy_blocking,tmp2,n_cols)
call ztrmm('L','U','C','N',n_cols,l_cols,CONE,tmat(1,1,istep),ubound(tmat,1),tmp2,n_cols)
call zgemm('N','N',l_rows,l_cols,n_cols,-CONE,hvm,ubound(hvm,1), &
tmp2,n_cols,CONE,q,ldq)
endif
enddo
deallocate(tmp1, tmp2, hvb, hvm, tmat_complete, t_tmp, t_tmp2)
deallocate(tmp1, tmp2, hvb, hvm)
end subroutine trans_ev_band_to_full_complex
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment