Commit d34aec18 authored by Andreas Marek

Some indentation

parent a5710519
......@@ -51,7 +51,7 @@
#include "../general/sanity.F90"
subroutine trans_ev_band_to_full_&
subroutine trans_ev_band_to_full_&
&MATH_DATATYPE&
&_&
&PRECISION &
......@@ -64,486 +64,487 @@
)
#endif
!-------------------------------------------------------------------------------
! trans_ev_band_to_full_real/complex:
! Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix
!
! Parameters
!
! na Order of matrix a_mat, number of rows of matrix q_mat
!
! nqc Number of columns of matrix q_mat
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwith
!
! a_mat(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a_mat after bandred_real/complex)
! Distribution is like in Scalapack.
!
! lda Leading dimension of a_mat
! matrixCols local columns of matrix a_mat and q_mat
!
! tmat(nbw,nbw,numBlocks) Factors returned by bandred_real/complex
!
! q_mat On input: Eigenvectors of band matrix
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! ldq Leading dimension of q_mat
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
use precision
use cuda_functions
use iso_c_binding
use elpa_abstract_impl
use elpa_blas_interfaces
implicit none
!-------------------------------------------------------------------------------
! trans_ev_band_to_full_real/complex:
! Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix
!
! Parameters
!
! na Order of matrix a_mat, number of rows of matrix q_mat
!
! nqc Number of columns of matrix q_mat
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwith
!
! a_mat(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a_mat after bandred_real/complex)
! Distribution is like in Scalapack.
!
! lda Leading dimension of a_mat
! matrixCols local columns of matrix a_mat and q_mat
!
! tmat(nbw,nbw,numBlocks) Factors returned by bandred_real/complex
!
! q_mat On input: Eigenvectors of band matrix
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! ldq Leading dimension of q_mat
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
use precision
use cuda_functions
use iso_c_binding
use elpa_abstract_impl
use elpa_blas_interfaces
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
logical, intent(in) :: useGPU
class(elpa_abstract_impl_t), intent(inout) :: obj
logical, intent(in) :: useGPU
#if REALCASE == 1
logical, intent(in) :: useQR
logical, intent(in) :: useQR
#endif
integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck) :: q_mat(ldq,*), tmat(nbw,nbw,*)
MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck) :: q_mat(ldq,*), tmat(nbw,nbw,*)
#else
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck) :: q_mat(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck) :: q_mat(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
integer(kind=MPI_KIND) :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI, mpierr
integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, &
max_local_cols
integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
MATH_DATATYPE(kind=rck), allocatable :: hvb(:)
MATH_DATATYPE(kind=rck), pointer :: hvm(:,:), tmp1(:), tmp2(:)
! hvm_dev is fist used and set in this routine
! q_mat is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted
! tmp_dev is first used in this routine
! tmat_dev is not passed along from bandred_real
integer(kind=C_intptr_T) :: hvm_dev, q_dev, tmp_dev, tmat_dev
type(c_ptr) :: hvm_host, tmp1_host, tmp2_host
integer(kind=ik) :: i
MATH_DATATYPE(kind=rck), allocatable :: tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:)
integer(kind=ik) :: t_cols, t_rows
integer(kind=ik) :: cwy_blocking
integer(kind=ik) :: istat
character(200) :: errorMessage
character(20) :: gpuString
logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
integer(kind=ik) :: blocking_factor, error
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("trans_ev_band_to_full_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX //&
gpuString)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
integer(kind=MPI_KIND) :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI, mpierr
integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, &
max_local_cols
integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
MATH_DATATYPE(kind=rck), allocatable :: hvb(:)
MATH_DATATYPE(kind=rck), pointer :: hvm(:,:), tmp1(:), tmp2(:)
! hvm_dev is fist used and set in this routine
! q_mat is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted
! tmp_dev is first used in this routine
! tmat_dev is not passed along from bandred_real
integer(kind=C_intptr_T) :: hvm_dev, q_dev, tmp_dev, tmat_dev
type(c_ptr) :: hvm_host, tmp1_host, tmp2_host
integer(kind=ik) :: i
MATH_DATATYPE(kind=rck), allocatable :: tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:)
integer(kind=ik) :: t_cols, t_rows
integer(kind=ik) :: cwy_blocking
integer(kind=ik) :: istat
character(200) :: errorMessage
character(20) :: gpuString
logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
integer(kind=ik) :: blocking_factor, error, blk_end
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("trans_ev_band_to_full_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX //&
gpuString)
#ifdef BAND_TO_FULL_BLOCKING
call obj%get("blocking_in_band_to_full",blocking_factor,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for blocking_in_band_to_full. Aborting..."
stop
endif
call obj%get("blocking_in_band_to_full",blocking_factor,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for blocking_in_band_to_full. Aborting..."
stop
endif
#else
blocking_factor = 1
blocking_factor = 1
#endif
call obj%timer%start("mpi_communication")
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI ,mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI ,mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI ,mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI ,mpierr)
my_prow = int(my_prowMPI,kind=c_int)
my_pcol = int(my_pcolMPI,kind=c_int)
np_rows = int(np_rowsMPI,kind=c_int)
np_cols = int(np_colsMPI,kind=c_int)
call obj%timer%stop("mpi_communication")
max_blocks_row = ((na -1)/nblk)/np_rows + 1 ! Rows of a_mat
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q_mat!
max_local_rows = max_blocks_row*nblk
max_local_cols = max_blocks_col*nblk
cwy_blocking = blocking_factor * nbw
if (useGPU) then
! copy q_mat to q_dev
successCUDA = cuda_malloc(q_dev,ldq*matrixCols*size_of_datatype)
check_alloc_cuda("trans_ev_band_to_full: q_dev", successCUDA)
successCUDA = cuda_host_register(int(loc(q_mat),kind=c_intptr_t),&
ldq*matrixCols*size_of_datatype,cudaHostRegisterDefault)
check_host_register_cuda("trans_ev_band_to_full: q_mat", successCUDA)
successCUDA = cuda_memcpy(q_dev,int(loc(q_mat),kind=c_intptr_t),&
ldq*matrixCols*size_of_datatype,cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev_band_to_full: q_mat -> q_dev", successCUDA)
successCUDA = cuda_malloc_host(tmp1_host,max_local_cols*cwy_blocking*size_of_datatype)
check_host_alloc_cuda("trans_ev_band_to_full: tmp1_host", successCUDA)
call c_f_pointer(tmp1_host, tmp1, (/max_local_cols*cwy_blocking/))
successCUDA = cuda_malloc_host(tmp2_host,max_local_cols*cwy_blocking*size_of_datatype)
check_host_alloc_cuda("trans_ev_band_to_full: tmp2_host", successCUDA)
call c_f_pointer(tmp2_host, tmp2, (/max_local_cols*cwy_blocking/))
successCUDA = cuda_malloc_host(hvm_host,max_local_rows*cwy_blocking*size_of_datatype)
check_host_alloc_cuda("trans_ev_band_to_full: hvm_host", successCUDA)
call c_f_pointer(hvm_host, hvm, (/max_local_rows,cwy_blocking/))
else ! useGPU
allocate(tmp1(max_local_cols*cwy_blocking), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: tmp1", istat, errorMessage)
allocate(tmp2(max_local_cols*cwy_blocking), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: tmp2", istat, errorMessage)
call obj%timer%start("mpi_communication")
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI ,mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI ,mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI ,mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI ,mpierr)
my_prow = int(my_prowMPI,kind=c_int)
my_pcol = int(my_pcolMPI,kind=c_int)
np_rows = int(np_rowsMPI,kind=c_int)
np_cols = int(np_colsMPI,kind=c_int)
call obj%timer%stop("mpi_communication")
max_blocks_row = ((na -1)/nblk)/np_rows + 1 ! Rows of a_mat
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q_mat!
max_local_rows = max_blocks_row*nblk
max_local_cols = max_blocks_col*nblk
cwy_blocking = blocking_factor * nbw
if (useGPU) then
! copy q_mat to q_dev
successCUDA = cuda_malloc(q_dev,ldq*matrixCols*size_of_datatype)
check_alloc_cuda("trans_ev_band_to_full: q_dev", successCUDA)
successCUDA = cuda_host_register(int(loc(q_mat),kind=c_intptr_t),&
ldq*matrixCols*size_of_datatype,cudaHostRegisterDefault)
check_host_register_cuda("trans_ev_band_to_full: q_mat", successCUDA)
allocate(hvm(max_local_rows,cwy_blocking), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: hvm", istat, errorMessage)
endif !useGPU
successCUDA = cuda_memcpy(q_dev,int(loc(q_mat),kind=c_intptr_t),&
ldq*matrixCols*size_of_datatype,cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev_band_to_full: q_mat -> q_dev", successCUDA)
allocate(hvb(max_local_rows*cwy_blocking), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: hvb", istat, errorMessage)
successCUDA = cuda_malloc_host(tmp1_host,max_local_cols*cwy_blocking*size_of_datatype)
check_host_alloc_cuda("trans_ev_band_to_full: tmp1_host", successCUDA)
call c_f_pointer(tmp1_host, tmp1, (/max_local_cols*cwy_blocking/))
allocate(tmat_complete(cwy_blocking,cwy_blocking), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: tmat_complete", istat, errorMessage)
successCUDA = cuda_malloc_host(tmp2_host,max_local_cols*cwy_blocking*size_of_datatype)
check_host_alloc_cuda("trans_ev_band_to_full: tmp2_host", successCUDA)
call c_f_pointer(tmp2_host, tmp2, (/max_local_cols*cwy_blocking/))
if (useGPU) then
successCUDA = cuda_host_register(int(loc(tmat_complete),kind=c_intptr_t), &
cwy_blocking * cwy_blocking * size_of_datatype,&
cudaHostRegisterDefault)
check_host_register_cuda("trans_ev_band_to_full: tmat_complete", successCUDA)
endif
successCUDA = cuda_malloc_host(hvm_host,max_local_rows*cwy_blocking*size_of_datatype)
check_host_alloc_cuda("trans_ev_band_to_full: hvm_host", successCUDA)
call c_f_pointer(hvm_host, hvm, (/max_local_rows,cwy_blocking/))
if (blocking_factor > 1) then
allocate(t_tmp(cwy_blocking,nbw), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: t_tmp", istat, errorMessage)
allocate(t_tmp2(cwy_blocking,nbw), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: t_tmp2", istat, errorMessage)
endif
else ! useGPU
allocate(tmp1(max_local_cols*cwy_blocking), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: tmp1", istat, errorMessage)
if (useGPU) then
successCUDA = cuda_malloc(hvm_dev,max_local_rows*cwy_blocking*size_of_datatype)
check_alloc_cuda("trans_ev_band_to_full: hvm_dev", successCUDA)
allocate(tmp2(max_local_cols*cwy_blocking), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: tmp2", istat, errorMessage)
successCUDA = cuda_malloc(tmp_dev,max_local_cols*cwy_blocking*size_of_datatype)
check_alloc_cuda("trans_ev_band_to_full: tmp_dev", successCUDA)
allocate(hvm(max_local_rows,cwy_blocking), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: hvm", istat, errorMessage)
endif !useGPU
successCUDA = cuda_malloc(tmat_dev,cwy_blocking*cwy_blocking*size_of_datatype)
check_alloc_cuda("trans_ev_band_to_full: tmat_dev", successCUDA)
endif
allocate(hvb(max_local_rows*cwy_blocking), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: hvb", istat, errorMessage)
hvm = 0.0_rck ! Must be set to 0 !!!
hvb = 0.0_rck ! Safety only
tmp1 = 0.0_rck
tmp2 = 0.0_rck
tmat_complete = 0.0_rck
if (blocking_factor > 1) then
t_tmp = 0.0_rck ! Must be set to 0 !!!
t_tmp2 = 0.0_rck
endif
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat
allocate(tmat_complete(cwy_blocking,cwy_blocking), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: tmat_complete", istat, errorMessage)
do istep=1,((na-1)/nbw-1)/blocking_factor + 1
if (useGPU) then
successCUDA = cuda_host_register(int(loc(tmat_complete),kind=c_intptr_t), &
cwy_blocking * cwy_blocking * size_of_datatype,&
cudaHostRegisterDefault)
check_host_register_cuda("trans_ev_band_to_full: tmat_complete", successCUDA)
endif
! This the call when using na >= ((blocking_factor+1)*nbw)
! n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw
! Number of columns in current step
! As an alternative we add some special case handling if na < cwy_blocking
if (na < cwy_blocking) then
n_cols = MAX(0, na-nbw)
if ( n_cols .eq. 0 ) then
exit
end if
else
n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step
end if
if (blocking_factor > 1) then
allocate(t_tmp(cwy_blocking,nbw), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: t_tmp", istat, errorMessage)
! Broadcast all Householder vectors for current step compressed in hvb
allocate(t_tmp2(cwy_blocking,nbw), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_band_to_full: t_tmp2", istat, errorMessage)
endif
nb = 0
ns = 0
if (useGPU) then
successCUDA = cuda_malloc(hvm_dev,max_local_rows*cwy_blocking*size_of_datatype)
check_alloc_cuda("trans_ev_band_to_full: hvm_dev", successCUDA)
do lc = 1, n_cols
ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder Vector
nrow = ncol - nbw ! absolute number of pivot row
successCUDA = cuda_malloc(tmp_dev,max_local_cols*cwy_blocking*size_of_datatype)
check_alloc_cuda("trans_ev_band_to_full: tmp_dev", successCUDA)
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number
successCUDA = cuda_malloc(tmat_dev,cwy_blocking*cwy_blocking*size_of_datatype)
check_alloc_cuda("trans_ev_band_to_full: tmat_dev", successCUDA)
endif
if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a_mat(1:l_rows,l_colh)
hvm = 0.0_rck ! Must be set to 0 !!!
hvb = 0.0_rck ! Safety only
tmp1 = 0.0_rck
tmp2 = 0.0_rck
tmat_complete = 0.0_rck
if (blocking_factor > 1) then
t_tmp = 0.0_rck ! Must be set to 0 !!!
t_tmp2 = 0.0_rck
endif
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat
nb = nb+l_rows
blk_end = ((na-1)/nbw-1)/blocking_factor + 1
do istep=1, blk_end
! This the call when using na >= ((blocking_factor+1)*nbw)
! n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw
! Number of columns in current step
! As an alternative we add some special case handling if na < cwy_blocking
if (na < cwy_blocking) then
n_cols = MAX(0, na-nbw)
if ( n_cols .eq. 0 ) then
exit
end if
else
n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step
end if
if (lc==n_cols .or. mod(ncol,nblk)==0) then
! Broadcast all Householder vectors for current step compressed in hvb
nb = 0
ns = 0
do lc = 1, n_cols
ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder Vector
nrow = ncol - nbw ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number
if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a_mat(1:l_rows,l_colh)
nb = nb+l_rows
if (lc==n_cols .or. mod(ncol,nblk)==0) then
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call MPI_Bcast(hvb(ns+1), int(nb-ns,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION,&
int(pcol(ncol, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%start("mpi_communication")
call MPI_Bcast(hvb(ns+1), int(nb-ns,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION,&
int(pcol(ncol, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
ns = nb
endif
enddo ! lc
! Expand compressed Householder vectors into matrix hvm
nb = 0
do lc = 1, n_cols
nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1.0_rck
nb = nb+l_rows
enddo
l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1)
! compute tmat2 out of tmat(:,:,)
tmat_complete = 0
do i = 1, blocking_factor
t_cols = MIN(nbw, n_cols - (i-1)*nbw)
if (t_cols <= 0) exit
t_rows = (i - 1) * nbw
tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*blocking_factor + i)
if (i > 1) then
call obj%timer%start("blas")
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, hvm, &
int(max_local_rows,kind=BLAS_KIND), hvm(:,(i-1)*nbw+1:), &
int(max_local_rows,kind=BLAS_KIND), ZERO, t_tmp, int(cwy_blocking, kind=BLAS_KIND))
call obj%timer%stop("blas")
ns = nb
endif
enddo ! lc
! Expand compressed Householder vectors into matrix hvm
nb = 0
do lc = 1, n_cols
nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1.0_rck
nb = nb+l_rows
enddo
l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1)
! compute tmat2 out of tmat(:,:,)
tmat_complete = 0
do i = 1, blocking_factor
t_cols = MIN(nbw, n_cols - (i-1)*nbw)
if (t_cols <= 0) exit
t_rows = (i - 1) * nbw
tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*blocking_factor + i)
if (i > 1) then
call obj%timer%start("blas")
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, hvm, &
int(max_local_rows,kind=BLAS_KIND), hvm(:,(i-1)*nbw+1:), &
int(max_local_rows,kind=BLAS_KIND), ZERO, t_tmp, int(cwy_blocking, kind=BLAS_KIND))
call obj%timer%stop("blas")
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_allreduce(t_tmp, t_tmp2, int(cwy_blocking*nbw,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
call obj%timer%start("mpi_communication")
call mpi_allreduce(t_tmp, t_tmp2, int(cwy_blocking*nbw,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
call obj%timer%start("blas")
call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), ONE, tmat_complete, &
int(cwy_blocking,kind=BLAS_KIND), t_tmp2, int(cwy_blocking,kind=BLAS_KIND))
call PRECISION_TRMM('R', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), -ONE, &
tmat_complete(t_rows+1,t_rows+1), &
int(cwy_blocking,kind=BLAS_KIND), t_tmp2, int(cwy_blocking,kind=BLAS_KIND))
call obj%timer%stop("blas")
call obj%timer%start("blas")
call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), ONE, tmat_complete, &
int(cwy_blocking,kind=BLAS_KIND), t_tmp2, int(cwy_blocking,kind=BLAS_KIND))
call PRECISION_TRMM('R', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), -ONE, &
tmat_complete(t_rows+1,t_rows+1), &
int(cwy_blocking,kind=BLAS_KIND), t_tmp2, int(cwy_blocking,kind=BLAS_KIND))
call obj%timer%stop("blas")
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
#else /* WITH_MPI */
call obj%timer%start("blas")
call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), ONE, tmat_complete, &
int(cwy_blocking,kind=BLAS_KIND), t_tmp, int(cwy_blocking,kind=BLAS_KIND))
call PRECISION_TRMM('R', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), -ONE, &
tmat_complete(t_rows+1,t_rows+1), &
int(cwy_blocking,kind=BLAS_KIND), t_tmp, int(cwy_blocking,kind=BLAS_KIND))
call obj%timer%stop("blas")
call obj%timer%start("blas")
call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), ONE, tmat_complete, &
int(cwy_blocking,kind=BLAS_KIND), t_tmp, int(cwy_blocking,kind=BLAS_KIND))
call PRECISION_TRMM('R', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), -ONE, &
tmat_complete(t_rows+1,t_rows+1), &
int(cwy_blocking,kind=BLAS_KIND), t_tmp, int(cwy_blocking,kind=BLAS_KIND))
call obj%timer%stop("blas")
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp(1:t_rows,1:t_cols)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp(1:t_rows,1:t_cols)
#endif /* WITH_MPI */
endif
enddo
endif
enddo
! Q = Q - V * T**T * V**T * Q
! Q = Q - V * T**T * V**T * Q
if (l_rows>0) then
if (useGPU) then
successCUDA = cuda_memcpy(hvm_dev, int(loc(hvm),kind=c_intptr_t), &
max_local_rows*cwy_blocking*size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev_band_to_full: hvm -> hvm_dev", successCUDA)
if (l_rows>0) then