Commit b015b6e1 authored by Andreas Marek

Elpa release 2013.08

This commit releases ELPA version 2013.08

It is identical to the latest commit of version 2013.02_BETA, except for the following changes:

- rename ELPA_2013.02.BETA -> ELPA_2013.08
- update configure.ac
- remove src/elpa2.f90_save
- remove test/Makefile

This version has been tested extensively; however, there might still be
some bugs.
In case of questions, please contact elpa-library@rzg.mpg.de or use the
ELPA forum at http://elpa-lib.fhi-berlin.mpg.de/forum/index.php
parent a74290c7
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.rzg.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of version 3 of the GNU Lesser General Public
! License as published by the Free Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
module ELPA2
! Version 1.1.2, 2011-02-21
USE ELPA1
implicit none
PRIVATE ! By default, all routines contained are private
! The following routines are public:
public :: solve_evp_real_2stage
public :: solve_evp_complex_2stage
public :: bandred_real
public :: tridiag_band_real
public :: trans_ev_tridi_to_band_real
public :: trans_ev_band_to_full_real
public :: bandred_complex
public :: tridiag_band_complex
public :: trans_ev_tridi_to_band_complex
public :: trans_ev_band_to_full_complex
!-------------------------------------------------------------------------------
! The following array contains the Householder vectors of the
! transformation band -> tridiagonal.
! It is allocated and set in tridiag_band_real and used in
! trans_ev_tridi_to_band_real.
! It must be deallocated by the user after trans_ev_tridi_to_band_real!
real*8, allocatable :: hh_trans_real(:,:)
complex*16, allocatable :: hh_trans_complex(:,:)
!-------------------------------------------------------------------------------
include 'mpif.h'
!******
contains
subroutine solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
!-------------------------------------------------------------------------------
! solve_evp_real_2stage: Solves the real eigenvalue problem with a 2-stage approach
!
! Parameters
!
! na Order of matrix a
!
! nev Number of eigenvalues needed
!
! a(lda,*) Distributed matrix for which eigenvalues are to be computed.
! Distribution is like in ScaLAPACK.
! The full matrix must be set (not only one half like in ScaLAPACK).
! Destroyed on exit (upper and lower half).
!
! lda Leading dimension of a
!
! ev(na) On output: eigenvalues of a, every processor gets the complete set
!
! q(ldq,*) On output: Eigenvectors of a
! Distribution is like in ScaLAPACK.
! Must always be dimensioned to the full size (corresponding to (na,na)),
! even if only a part of the eigenvalues is needed.
!
! ldq Leading dimension of q
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
! mpi_comm_all
! MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
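! A minimal usage sketch (comment only). The variable names and the way the
! process grid is set up are assumptions of this example; calling
! get_elpa_row_col_comms from ELPA1 is one possible way to create the
! row/column communicators:
!
!   call get_elpa_row_col_comms(mpi_comm_world, my_prow, my_pcol, &
!                               mpi_comm_rows, mpi_comm_cols)
!   ! a(1:lda,1:local_cols) must hold the FULL symmetric matrix (both halves)
!   call solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                              mpi_comm_rows, mpi_comm_cols, mpi_comm_world)
!   ! ev(1:na) now holds all eigenvalues on every task,
!   ! q(:,:) the distributed eigenvectors belonging to the first nev of them
!-------------------------------------------------------------------------------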
implicit none
integer, intent(in) :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
real*8, intent(inout) :: a(lda,*), ev(na), q(ldq,*)
integer my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
integer nbw, num_blocks
real*8, allocatable :: tmat(:,:,:), e(:)
real*8 ttt0, ttt1, ttts
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
nbw = (31/nblk+1)*nblk
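! For example, nblk=16 and nblk=32 both give nbw=32, while nblk=64 gives nbw=64;
! i.e. nbw is the smallest multiple of nblk that is at least 32.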
num_blocks = (na-1)/nbw + 1
allocate(tmat(nbw,nbw,num_blocks))
! Reduction full -> band
ttt0 = MPI_Wtime()
ttts = ttt0
call bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, tmat)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
print 1,'Time bandred_real :',ttt1-ttt0
! Reduction band -> tridiagonal
allocate(e(na))
ttt0 = MPI_Wtime()
call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
print 1,'Time tridiag_band_real :',ttt1-ttt0
call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts
! Solve tridiagonal system
ttt0 = MPI_Wtime()
call solve_tridi(na, nev, ev, e, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
print 1,'Time solve_tridi :',ttt1-ttt0
time_evp_solve = ttt1-ttt0
ttts = ttt1
deallocate(e)
! Backtransform stage 1
ttt0 = MPI_Wtime()
call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows, mpi_comm_cols)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
print 1,'Time trans_ev_tridi_to_band_real:',ttt1-ttt0
! We can now deallocate the stored Householder vectors
deallocate(hh_trans_real)
! Backtransform stage 2
ttt0 = MPI_Wtime()
call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, mpi_comm_cols)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
print 1,'Time trans_ev_band_to_full_real :',ttt1-ttt0
time_evp_back = ttt1-ttts
deallocate(tmat)
1 format(a,f10.3)
end subroutine solve_evp_real_2stage
!-------------------------------------------------------------------------------
!-------------------------------------------------------------------------------
subroutine solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
!-------------------------------------------------------------------------------
! solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2-stage approach
!
! Parameters
!
! na Order of matrix a
!
! nev Number of eigenvalues needed
!
! a(lda,*) Distributed matrix for which eigenvalues are to be computed.
! Distribution is like in ScaLAPACK.
! The full matrix must be set (not only one half like in ScaLAPACK).
! Destroyed on exit (upper and lower half).
!
! lda Leading dimension of a
!
! ev(na) On output: eigenvalues of a, every processor gets the complete set
!
! q(ldq,*) On output: Eigenvectors of a
! Distribution is like in ScaLAPACK.
! Must always be dimensioned to the full size (corresponding to (na,na)),
! even if only a part of the eigenvalues is needed.
!
! ldq Leading dimension of q
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
! mpi_comm_all
! MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
implicit none
integer, intent(in) :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
complex*16, intent(inout) :: a(lda,*), q(ldq,*)
real*8, intent(inout) :: ev(na)
integer my_prow, my_pcol, np_rows, np_cols, mpierr
integer l_cols, l_rows, l_cols_nev, nbw, num_blocks
complex*16, allocatable :: tmat(:,:,:)
real*8, allocatable :: q_real(:,:), e(:)
real*8 ttt0, ttt1, ttts
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
nbw = (31/nblk+1)*nblk
num_blocks = (na-1)/nbw + 1
allocate(tmat(nbw,nbw,num_blocks))
! Reduction full -> band
ttt0 = MPI_Wtime()
ttts = ttt0
call bandred_complex(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, tmat)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
print 1,'Time bandred_complex :',ttt1-ttt0
! Reduction band -> tridiagonal
allocate(e(na))
ttt0 = MPI_Wtime()
call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
print 1,'Time tridiag_band_complex :',ttt1-ttt0
call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev
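! The eigenvectors of the real symmetric tridiagonal problem are real, so
! solve_tridi works on a real work array q_real; its first nev columns are
! copied into the complex eigenvector matrix q further below.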
allocate(q_real(l_rows,l_cols))
! Solve tridiagonal system
ttt0 = MPI_Wtime()
call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,1), nblk, mpi_comm_rows, mpi_comm_cols)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
print 1,'Time solve_tridi :',ttt1-ttt0
time_evp_solve = ttt1-ttt0
ttts = ttt1
q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
deallocate(e, q_real)
! Backtransform stage 1
ttt0 = MPI_Wtime()
call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_rows, mpi_comm_cols)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
print 1,'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0
! We can now deallocate the stored Householder vectors
deallocate(hh_trans_complex)
! Backtransform stage 2
ttt0 = MPI_Wtime()
call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, mpi_comm_cols)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
print 1,'Time trans_ev_band_to_full_complex :',ttt1-ttt0
time_evp_back = ttt1-ttts
deallocate(tmat)
1 format(a,f10.3)
end subroutine solve_evp_complex_2stage
!-------------------------------------------------------------------------------
subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, tmat)
!-------------------------------------------------------------------------------
! bandred_real: Reduces a distributed symmetric matrix to band form
!
! Parameters
!
! na Order of matrix
!
! a(lda,*) Distributed matrix which should be reduced.
! Distribution is like in ScaLAPACK.
! As opposed to ScaLAPACK, a(:,:) must be set completely (upper and lower half).
! a(:,:) is overwritten on exit with the band and the Householder vectors
! in the upper half.
!
! lda Leading dimension of a
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwidth of output matrix
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
! tmat(nbw,nbw,num_blocks) where num_blocks = (na-1)/nbw + 1
! Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
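! Note: tmat holds, per block, the triangular factor of the compact WY
! (blocked Householder) representation, i.e. the accumulated transformation of
! a block of Householder vectors V has the form Q = I - V * T * V**T (with the
! transpose convention for T as used in the dtrmm calls below); these factors
! are needed again by trans_ev_band_to_full_real for the back transformation.
!-------------------------------------------------------------------------------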
implicit none
integer na, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols
real*8 a(lda,*), tmat(nbw,nbw,*)
integer my_prow, my_pcol, np_rows, np_cols, mpierr
integer l_cols, l_rows
integer i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
integer istep, ncol, lch, lcx, nlc
integer tile_size, l_rows_tile, l_cols_tile
real*8 vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
real*8, allocatable:: tmp(:,:), vr(:), vmr(:,:), umc(:,:)
integer pcol, prow
pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number
prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number
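! Example: with nblk=16 and np_cols=4, global columns 1..16 belong to process
! column 0, columns 17..32 to process column 1, ..., and columns 65..80 again
! to process column 0 (block-cyclic distribution); prow maps rows analogously.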
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
! Semibandwidth nbw must be a multiple of blocksize nblk
if(mod(nbw,nblk)/=0) then
if(my_prow==0 .and. my_pcol==0) then
print *,'ERROR: nbw=',nbw,', nblk=',nblk
print *,'ELPA2 works only for nbw==n*nblk'
call mpi_abort(mpi_comm_world,0,mpierr)
endif
endif
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
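! Example: for np_rows=4, np_cols=3, nblk=16 the least common multiple is 12,
! giving a minimum tile size of 192; rounding up so that local tiles are at
! least 128 wide yields tile_size=576, l_rows_tile=144, l_cols_tile=192.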
do istep = (na-1)/nbw, 1, -1
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
! Number of local columns/rows of remaining matrix
l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
allocate(vmr(max(l_rows,1),2*n_cols))
allocate(umc(max(l_cols,1),2*n_cols))
allocate(vr(l_rows+1))
vmr(1:l_rows,1:n_cols) = 0.
vr(:) = 0
tmat(:,:,istep) = 0
! Reduce current block to lower triangular form
do lc = n_cols, 1, -1
ncol = istep*nbw + lc ! Absolute column number of Householder vector
nrow = ncol - nbw ! Absolute number of pivot row
lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
tau = 0
if(nrow == 1) exit ! Nothing to do
cur_pcol = pcol(ncol) ! Processor column owning current block
if(my_pcol==cur_pcol) then
! Get vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
vr(1:lr) = a(1:lr,lch) ! vector to be transformed
if(my_prow==prow(nrow)) then
aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
aux1(2) = vr(lr)
else
aux1(1) = dot_product(vr(1:lr),vr(1:lr))
aux1(2) = 0.
endif
call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
vnorm2 = aux2(1)
vrl = aux2(2)
! Householder transformation
call hh_transform_real(vrl, vnorm2, xf, tau)
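! (After this call, tau is the reflector coefficient, xf the scaling that
! normalizes vr to a Householder vector with vr(lr)=1, and vrl the single
! value that remains in the band; see the stores into a(:,lch) just below.)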
! Scale vr and store Householder vector for back transformation
vr(1:lr) = vr(1:lr) * xf
if(my_prow==prow(nrow)) then
a(1:lr-1,lch) = vr(1:lr-1)
a(lr,lch) = vrl
vr(lr) = 1.
else
a(1:lr,lch) = vr(1:lr)
endif
endif
! Broadcast Householder vector and tau along columns
vr(lr+1) = tau
call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
vmr(1:lr,lc) = vr(1:lr)
tau = vr(lr+1)
tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
! Transform remaining columns in current block with Householder vector
! Local dot product
aux1 = 0
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if(lcx>0) then
nlc = nlc+1
if(lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
enddo
! Get global dot products
if(nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if(lcx>0) then
nlc = nlc+1
a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
endif
enddo
enddo
! Calculate scalar products of stored Householder vectors.
! This can be done in different ways; here we use dsyrk
vav = 0
if(l_rows>0) &
call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,1),0.d0,vav,ubound(vav,1))
call symm_matrix_allreduce(n_cols,vav,ubound(vav,1),mpi_comm_rows)
! Calculate triangular matrix T for block Householder Transformation
do lc=n_cols,1,-1
tau = tmat(lc,lc,istep)
if(lc<n_cols) then
call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,1),vav(lc+1,lc),1)
tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
endif
enddo
! Transpose vmr -> vmc (stored in umc, second half)
call elpa_transpose_vectors (vmr, ubound(vmr,1), mpi_comm_rows, &
umc(1,n_cols+1), ubound(umc,1), mpi_comm_cols, &
1, istep*nbw, n_cols, nblk)
! Calculate umc = A**T * vmr
! Note that the distributed A has to be transposed
! As opposed to direct tridiagonalization, there is no need to exploit the cache
! locality of the tiles, so we can use strips of the matrix
umc(1:l_cols,1:n_cols) = 0.d0
vmr(1:l_rows,n_cols+1:2*n_cols) = 0
if(l_cols>0 .and. l_rows>0) then
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if(lce<lcs) cycle
lre = min(l_rows,(i+1)*l_rows_tile)
call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,1), &
vmr,ubound(vmr,1),1.d0,umc(lcs,1),ubound(umc,1))
if(i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
umc(lcs,n_cols+1),ubound(umc,1),1.d0,vmr(1,n_cols+1),ubound(vmr,1))
enddo
endif
! Sum up all ur(:) parts along rows and add them to the uc(:) parts
! on the processors containing the diagonal
! This is only necessary if ur has been calculated, i.e. if the
! global tile size is smaller than the global remaining matrix
if(tile_size < istep*nbw) then
call elpa_reduce_add_vectors (vmr(1,n_cols+1),ubound(vmr,1),mpi_comm_rows, &
umc, ubound(umc,1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
endif
if(l_cols>0) then
allocate(tmp(l_cols,n_cols))
call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
deallocate(tmp)
endif
! U = U * Tmat**T
call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,1),umc,ubound(umc,1))
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umc,ubound(umc,1),umc(1,n_cols+1),ubound(umc,1),0.d0,vav,ubound(vav,1))
call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,1),vav,ubound(vav,1))
call symm_matrix_allreduce(n_cols,vav,ubound(vav,1),mpi_comm_cols)
! U = U - 0.5 * V * VAV
call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umc(1,n_cols+1),ubound(umc,1),vav,ubound(vav,1),1.d0,umc,ubound(umc,1))
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors (umc, ubound(umc,1), mpi_comm_cols, &
vmr(1,n_cols+1), ubound(vmr,1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
! A = A - V*U**T - U*V**T
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)