Commit c45e001d authored by Alexander Heinecke's avatar Alexander Heinecke
Browse files

added blocking for complex band to full

parent 545881e6
...@@ -2208,9 +2208,9 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld ...@@ -2208,9 +2208,9 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
integer l_cols, l_rows, l_colh, n_cols integer l_cols, l_rows, l_colh, n_cols
integer istep, lc, ncol, nrow, nb, ns integer istep, lc, ncol, nrow, nb, ns
complex*16, allocatable:: tmp1(:), tmp2(:), hvb(:), hvm(:,:) complex*16, allocatable:: tmp1(:), tmp2(:), hvb(:), hvm(:,:), tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:)
integer pcol, prow, i integer pcol, prow, i, cwy_blocking, t_blocking, t_cols, t_rows
pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number
prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number
...@@ -2227,19 +2227,25 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld ...@@ -2227,19 +2227,25 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
max_local_rows = max_blocks_row*nblk max_local_rows = max_blocks_row*nblk
max_local_cols = max_blocks_col*nblk max_local_cols = max_blocks_col*nblk
allocate(tmp1(max_local_cols*nbw)) t_blocking = 2 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
allocate(tmp2(max_local_cols*nbw)) cwy_blocking = t_blocking * nbw
allocate(hvb(max_local_rows*nbw))
allocate(hvm(max_local_rows,nbw)) allocate(tmp1(max_local_cols*cwy_blocking))
allocate(tmp2(max_local_cols*cwy_blocking))
allocate(hvb(max_local_rows*cwy_blocking))
allocate(hvm(max_local_rows,cwy_blocking))
allocate(tmat_complete(cwy_blocking,cwy_blocking))
allocate(t_tmp(cwy_blocking,nbw))
allocate(t_tmp2(cwy_blocking,nbw))
hvm = 0 ! Must be set to 0 !!! hvm = 0 ! Must be set to 0 !!!
hvb = 0 ! Safety only hvb = 0 ! Safety only
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
do istep=1,(na-1)/nbw do istep=1,((na-1)/nbw-1)/t_blocking + 1
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step
! Broadcast all Householder vectors for current step compressed in hvb ! Broadcast all Householder vectors for current step compressed in hvb
...@@ -2247,7 +2253,7 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld ...@@ -2247,7 +2253,7 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
ns = 0 ns = 0
do lc = 1, n_cols do lc = 1, n_cols
ncol = istep*nbw + lc ! absolute column number of householder vector ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder vector
nrow = ncol - nbw ! absolute number of pivot row nrow = ncol - nbw ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
...@@ -2267,7 +2273,7 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld ...@@ -2267,7 +2273,7 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
nb = 0 nb = 0
do lc = 1, n_cols do lc = 1, n_cols
nrow = (istep-1)*nbw+lc ! absolute number of pivot row nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows) hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
...@@ -2276,10 +2282,26 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld ...@@ -2276,10 +2282,26 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
nb = nb+l_rows nb = nb+l_rows
enddo enddo
l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1) l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1)
! Q = Q - V * T**T * V**T * Q ! Q = Q - V * T**T * V**T * Q
! compute tmat2 out of tmat(:,:,)
tmat_complete(:,:) = CZERO
do i = 1, t_blocking
t_cols = MIN(nbw, n_cols - (i-1)*nbw)
if(t_cols <= 0) exit
t_rows = (i - 1) * nbw
tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i)
if(i > 1) then
call zgemm('C', 'N', t_rows, t_cols, l_rows, CONE, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), max_local_rows, CZERO, t_tmp, cwy_blocking)
call mpi_allreduce(t_tmp,t_tmp2,cwy_blocking*nbw,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
call ztrmm('L','U','N','N',t_rows,t_cols,CONE,tmat_complete,cwy_blocking,t_tmp2,cwy_blocking)
call ztrmm('R','U','N','N',t_rows,t_cols,-CONE,tmat_complete(t_rows+1,t_rows+1),cwy_blocking,t_tmp2,cwy_blocking)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
endif
enddo
if(l_rows>0) then if(l_rows>0) then
call zgemm('C','N',n_cols,l_cols,l_rows,CONE,hvm,ubound(hvm,1), & call zgemm('C','N',n_cols,l_cols,l_rows,CONE,hvm,ubound(hvm,1), &
q,ldq,CZERO,tmp1,n_cols) q,ldq,CZERO,tmp1,n_cols)
...@@ -2288,14 +2310,14 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld ...@@ -2288,14 +2310,14 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
endif endif
call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
if(l_rows>0) then if(l_rows>0) then
call ztrmm('L','U','C','N',n_cols,l_cols,CONE,tmat(1,1,istep),ubound(tmat,1),tmp2,n_cols) call ztrmm('L','U','C','N',n_cols,l_cols,CONE,tmat_complete,cwy_blocking,tmp2,n_cols)
call zgemm('N','N',l_rows,l_cols,n_cols,-CONE,hvm,ubound(hvm,1), & call zgemm('N','N',l_rows,l_cols,n_cols,-CONE,hvm,ubound(hvm,1), &
tmp2,n_cols,CONE,q,ldq) tmp2,n_cols,CONE,q,ldq)
endif endif
enddo enddo
deallocate(tmp1, tmp2, hvb, hvm) deallocate(tmp1, tmp2, hvb, hvm, tmat_complete, t_tmp, t_tmp2)
end subroutine trans_ev_band_to_full_complex end subroutine trans_ev_band_to_full_complex
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment