#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
subroutine bandred_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, a_mat, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
wantDebug, useGPU, success, &
#if REALCASE == 1
useQR, &
#endif
max_threads)
!-------------------------------------------------------------------------------
! bandred_real/complex: Reduces a distributed symmetric matrix to band form
!
! Parameters
!
! na Order of matrix
!
! a_mat(lda,matrixCols) Distributed matrix which should be reduced.
! Distribution is like in ScaLAPACK.
! As opposed to ScaLAPACK, a_mat(:,:) must be set completely (upper and lower half)
! a_mat(:,:) is overwritten on exit with the band and the Householder vectors
! in the upper half.
!
! lda Leading dimension of a_mat
! matrixCols local columns of matrix a_mat
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwidth of output matrix
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1
! Factors for the Householder vectors (returned), needed for back transformation
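! e.g. na=1000, nbw=64 gives numBlocks = 999/64 + 1 = 16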
!
!-------------------------------------------------------------------------------
use cuda_functions
use iso_c_binding
use elpa1_compute
#ifdef WITH_OPENMP_TRADITIONAL
use omp_lib
#endif
use precision
use elpa_blas_interfaces
#ifdef WITH_MPI
use elpa_scalapack_interfaces
#endif
use elpa_abstract_impl
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,*)
#else
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,numBlocks)
#endif
#if REALCASE == 1
real(kind=rk) :: eps
#endif
logical, intent(in) :: useGPU
integer(kind=c_int) :: skewsymmetric
logical :: isSkewsymmetric
character(20) :: gpuString
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
integer(kind=ik) :: l_cols, l_rows
#if REALCASE == 1
integer(kind=ik) :: vmrCols
#endif
#ifdef WITH_OPENMP_TRADITIONAL
integer(kind=ik) :: mynlc, lrs, transformChunkSize
#endif
integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
integer(kind=ik) :: istep, ncol, lch, lcx, nlc
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
real(kind=rk) :: vnorm2
MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau
MATH_DATATYPE(kind=rck) :: vav(nbw,nbw)
MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:)
MATH_DATATYPE(kind=rck), pointer :: vmrCUDA(:), umcCUDA(:)
MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
MATH_DATATYPE(kind=rck), allocatable :: vr(:)
#if REALCASE == 1
! needed for blocked QR decomposition
integer(kind=ik) :: PQRPARAM(11), work_size
real(kind=rk) :: dwork_size(1)
real(kind=rk), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)
#endif
integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
type(c_ptr) :: vmr_host, umc_host
#ifdef WITH_MPI
!integer(kind=ik), external :: numroc -> use elpa_scalapack
#endif
integer(kind=ik) :: ierr
integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size
integer(kind=ik) :: l_rows2, vmr_size2, umc_size2
integer(kind=c_intptr_t) :: lc_start, lc_end
#if COMPLEXCASE == 1
integer(kind=c_intptr_t) :: lce_1, lcs_1, lre_1
#endif
integer(kind=ik) :: lr_end
integer(kind=ik) :: na_cols
integer(kind=BLAS_KIND) :: na_colsBLAS
#if COMPLEXCASE == 1
integer(kind=ik) :: na_rows
integer(kind=BLAS_KIND) :: na_rowsBLAS
#endif
logical, intent(in) :: wantDebug
logical, intent(out) :: success
logical :: successCUDA
integer(kind=ik) :: istat
character(200) :: errorMessage
integer(kind=ik) :: min_tile_size, error
#if REALCASE == 1
logical, intent(in) :: useQR
#endif
integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
ii, pp
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
logical :: useGPU_reduction_lower_block_to_tridiagonal
integer(kind=ik), intent(in) :: max_threads
logical :: do_memcpy
integer(kind=ik) :: i_blk,blk_off, blk_end
call obj%get("is_skewsymmetric",skewsymmetric,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for skewsymmetric settings. Aborting..."
stop
endif
isSkewsymmetric = (skewsymmetric == 1)
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("bandred_&
&MATH_DATATYPE&
&" // &
PRECISION_SUFFIX // &
gpuString )
useGPU_reduction_lower_block_to_tridiagonal = .false.
if (useGPU) then
useGPU_reduction_lower_block_to_tridiagonal = .true.
#if REALCASE == 1
if (useQR) then
!in this case switch off GPU usage for step "reduce current block to lower triangular form"
! since this is done by QR decomposition
useGPU_reduction_lower_block_to_tridiagonal = .false.
endif
#endif
endif
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI ,mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI ,mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI ,mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI ,mpierr)
my_prow = int(my_prowMPI,kind=c_int)
np_rows = int(np_rowsMPI,kind=c_int)
my_pcol = int(my_pcolMPI,kind=c_int)
np_cols = int(np_colsMPI,kind=c_int)
if (wantDebug) call obj%timer%stop("mpi_communication")
success = .true.
! Semibandwidth nbw must be a multiple of blocksize nblk
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE&
&: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE&
&: ELPA2 works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
! na_rows is used nowhere; only na_cols
if (useGPU) then
#ifdef WITH_MPI
#if COMPLEXCASE == 1
na_rowsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), &
int(my_prow,kind=BLAS_KIND), 0_BLAS_KIND, int(np_rows,kind=BLAS_KIND))
na_rows = int(na_rowsBLAS,kind=c_int)
#endif
na_colsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), &
int(my_pcol,kind=BLAS_KIND), 0_BLAS_KIND, int(np_cols,kind=BLAS_KIND))
na_cols = int(na_colsBLAS,kind=c_int)
#else
#if COMPLEXCASE == 1
na_rows = na
#endif
na_cols = na
#endif /* WITH_MPI */
! Here we convert the regular host array into a pinned host array
successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
check_alloc_cuda("bandred: a_dev", successCUDA)
successCUDA = cuda_host_register(int(loc(vav),kind=c_intptr_t), &
nbw * nbw * size_of_datatype,&
cudaHostRegisterDefault)
check_host_register_cuda("bandred: vav", successCUDA)
successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
check_alloc_cuda("bandred: vav_dev", successCUDA)
endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
! make tile_size the smallest possible multiple of the previously defined tile size
! that is larger than or equal to min_tile_size
! min_tile_size was originally hardcoded as 128 * max(np_rows, np_cols), so this is now the implicit default
! it can, however, be set by the user
call obj%get("min_tile_size", min_tile_size ,error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option for min_tile_size. Aborting..."
stop
endif
if(min_tile_size == 0) then
! not set by the user, use the default value
min_tile_size = 128*max(np_rows, np_cols)
endif
tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
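! e.g. np_rows = np_cols = 4, nblk = 16: the lcm tile is 16*4 = 64; with the default min_tile_size = 128*4 = 512
! this gives tile_size = ((512-1)/64+1)*64 = 512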
#if REALCASE == 1
if (useQR) then
if (which_qr_decomposition == 1) then
call qr_pqrparam_init(obj,pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s')
allocate(tauvector(na), stat=istat, errmsg=errorMessage)
check_allocate("bandred: tauvector", istat, errorMessage)
allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
check_allocate("bandred: blockheuristic", istat, errorMessage)
l_rows = local_index(na, my_prow, np_rows, nblk, -1)
allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
check_allocate("bandred: vmrCPU", istat, errorMessage)
vmrCols = na
#ifdef USE_ASSUMED_SIZE_QR
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
#else
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(obj, a_mat(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif
work_size = int(dwork_size(1))
allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
check_allocate("bandred: work_blocked", istat, errorMessage)
work_blocked = 0.0_rk
deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
check_deallocate("bandred: vmrCPU", istat, errorMessage)
endif ! which_qr_decomposition
endif ! useQr
#endif /* REALCASE */
blk_end = (na-1)/nbw
if (useGPU) then
successCUDA = cuda_host_register(int(loc(a_mat),kind=c_intptr_t), &
lda*na_cols*size_of_datatype, cudaHostRegisterDefault)
check_host_register_cuda("bandred: a_mat", successCUDA)
cur_l_rows = 0
cur_l_cols = 0
successCUDA = cuda_memcpy(a_dev, int(loc(a_mat),kind=c_intptr_t), &
lda*na_cols*size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("bandred: a_dev", successCUDA)
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_datatype)
check_alloc_cuda("bandred: tmat_dev", successCUDA)
istep = (na-1)/nbw
blk_end = (na-1)/nbw
n_cols = min(na,(istep+1)*nbw)-istep*nbw
l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
cur_l_rows = max(l_rows,1)
cur_l_cols = max(l_cols,1)
vmr_size = cur_l_rows*2*n_cols
umc_size = cur_l_cols*2*n_cols
istep = (na-1)/nbw - 1
n_cols = min(na,(istep+1)*nbw)-istep*nbw
l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
l_rows2 = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
cur_l_rows = max(l_rows2,1)
cur_l_cols = max(l_cols,1)
vmr_size2 = cur_l_rows*2*n_cols
umc_size2 = cur_l_cols*2*n_cols
l_rows = max(l_rows,l_rows2)
vmr_size = max(vmr_size,vmr_size2)
umc_size = max(umc_size,umc_size2)
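! The work spaces are allocated once for the whole istep loop; the first block may be a remainder smaller
! than nbw, so the sizes needed by the first two steps are probed and the maximum of both is used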
allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_&
&MATH_DATATYPE&
&: error when allocating vr "//errorMessage
stop 1
endif
successCUDA = cuda_malloc_host(vmr_host,vmr_size*size_of_datatype)
check_host_alloc_cuda("bandred: vmr_host", successCUDA)
call c_f_pointer(vmr_host, vmrCUDA, (/vmr_size/))
successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_datatype)
check_alloc_cuda("bandred: vmr_dev", successCUDA)
successCUDA = cuda_malloc_host(umc_host,umc_size*size_of_datatype)
check_host_alloc_cuda("bandred: umc_host", successCUDA)
call c_f_pointer(umc_host, umcCUDA, (/umc_size/))
successCUDA = cuda_malloc(umc_dev, umc_size*size_of_datatype)
check_alloc_cuda("bandred: umc_dev", successCUDA)
endif ! useGPU
do istep = blk_end, 1, -1
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
! Number of local columns/rows of remaining matrix
l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
if (useGPU) then
cur_l_rows = max(l_rows, 1)
cur_l_cols = max(l_cols, 1)
vmr_size = cur_l_rows * 2 * n_cols
umc_size = cur_l_cols * 2 * n_cols
else ! GPU not used
! unify the name vmr and vmrCPU, as well as vmrGPU
! the same for umcCPU and umcGPU
! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
check_allocate("bandred: vmrCPU", istat, errorMessage)
allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
check_allocate("bandred: umcCPU", istat, errorMessage)
allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
check_allocate("bandred: vr", istat, errorMessage)
endif ! use GPU
if (useGPU) then
vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck
umcCUDA(1 : umc_size) = 0.0_rck
else
vmrCPU(1:l_rows,1:n_cols) = 0.0_rck
endif ! useGPU
vr(:) = 0.0_rck
tmat(:,:,istep) = 0.0_rck
if (useGPU) then
lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
lr_end = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)
if (lc_start .le. 0) lc_start = 1
do_memcpy = .false.
! Note: mod(nbw,nblk) == 0
do i_blk = 1, nbw/nblk
blk_off = (i_blk-1) * nblk
cur_pcol = pcol(istep*nbw+1+blk_off, nblk, np_cols)
if (my_pcol == cur_pcol) then
do_memcpy = .true.
endif
enddo
if (do_memcpy) then
successCUDA = cuda_memcpy2d(int(loc(a_mat(1, lc_start)),kind=c_intptr_t), &
int((lda*size_of_datatype),kind=c_intptr_t), &
(a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )), &
int(lda*size_of_datatype,kind=c_intptr_t), &
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
check_memcpy_cuda("bandred: a_dev -> a_mat", successCUDA)
endif
endif ! useGPU
! Reduce current block to lower triangular form
#if REALCASE == 1
if (useQR) then
if (which_qr_decomposition == 1) then
vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, &
work_size, na, n_cols, nblk, nblk, &
istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
blockheuristic)
#else
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(obj, a_mat(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) , &
max(l_rows,1), vmrCols, tauvector(1:na), na, &
tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
work_size, na, n_cols, nblk, nblk, &
istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
blockheuristic)
#endif
endif
else !useQR
#endif /* REALCASE == 1 */
do lc = n_cols, 1, -1
ncol = istep*nbw + lc ! absolute column number of householder Vector
nrow = ncol - nbw ! Absolute number of pivot row
lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
tau = 0
if (nrow == 1) exit ! Nothing to do
cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block
if (my_pcol==cur_pcol) then
! Get Vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
vr(1:lr) = a_mat(1:lr,lch) ! Vector to be transformed
if (my_prow==prow(nrow, nblk, np_rows)) then
aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
aux1(2) = vr(lr)
else
aux1(1) = dot_product(vr(1:lr),vr(1:lr))
aux1(2) = 0.0_rck
endif
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_allreduce(aux1, aux2, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
aux2 = aux1 ! this should be optimized
#endif
#if REALCASE == 1
vnorm2 = aux2(1)
#endif
#if COMPLEXCASE == 1
vnorm2 = real(aux2(1),kind=rk)
#endif
vrl = aux2(2)
! Householder transformation
call hh_transform_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, vrl, vnorm2, xf, tau, wantDebug)
! Scale vr and store Householder Vector for back transformation
vr(1:lr) = vr(1:lr) * xf
if (my_prow==prow(nrow, nblk, np_rows)) then
a_mat(1:lr-1,lch) = vr(1:lr-1)
a_mat(lr,lch) = vrl
vr(lr) = 1.0_rck
else
a_mat(1:lr,lch) = vr(1:lr)
endif
endif
! Broadcast Householder Vector and tau along columns
vr(lr+1) = tau
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call MPI_Bcast(vr, int(lr+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
int(cur_pcol,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
if (useGPU_reduction_lower_block_to_tridiagonal) then
vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
else
vmrCPU(1:lr,lc) = vr(1:lr)
endif
tau = vr(lr+1)
#if REALCASE == 1
tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
! Transform remaining columns in current block with Householder Vector
! Local dot product
aux1 = 0.0_rck
#ifdef WITH_OPENMP_TRADITIONAL
#if 0
! original complex implementation without openmp. check performance
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
endif
enddo
! Get global dot products
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
endif
enddo
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
endif
enddo
#endif /* WITH_MPI */
!
! ! Transform
!
! nlc = 0
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
! endif
! enddo
#endif /* if 0 */
!Open up one omp region to avoid paying openmp overhead.
!This does not help performance due to the addition of two openmp barriers around the MPI call,
!but in the future this may be beneficial if these barriers are replaced with a faster implementation
!$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
mynlc = 0 ! number of local columns
!This loop does not have independent iterations,
!'mynlc' is incremented each iteration, and it is difficult to remove this dependency.
!Thus each thread executes every iteration of the loop, but only does the work if it 'owns' that iteration.
!That is, a thread only executes the work associated with an iteration if its thread id is congruent to
!the iteration number modulo the number of threads
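!For example, with 4 threads, thread 0 handles j = 1, 5, 9, ... and thread 1 handles j = 2, 6, 10, ...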
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0 ) then
mynlc = mynlc+1
if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
endif
endif
enddo
! Get global dot products
!$omp barrier
!$omp single
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
if (mynlc>0) call mpi_allreduce(aux1, aux2, int(mynlc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
!$omp end single
!$omp barrier
! Transform
transformChunkSize=32
mynlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
mynlc = mynlc+1
!This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
!However, for some reason this is slower than doing it manually, so it is parallelized as below.
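!For example, with chunk size 32 and 4 threads, thread 0 updates rows 1..32, 129..160, ...,
!while thread 1 updates rows 33..64, 161..192, ...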
do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
do pp = 1,transformChunkSize
if (pp + ii > lr) exit
#if REALCASE == 1
a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
enddo
enddo
endif
enddo
!$omp end parallel
#else /* WITH_OPENMP_TRADITIONAL */
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
endif
enddo
! Get global dot products
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
#if REALCASE == 1
a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
#endif
#if COMPLEXCASE == 1
a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
#endif
endif
enddo
#endif /* WITH_OPENMP_TRADITIONAL */
enddo ! lc
if (useGPU_reduction_lower_block_to_tridiagonal) then
! store column tiles back to GPU
if (do_memcpy) then
successCUDA = cuda_memcpy2d((a_dev+ &
int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), &
int(lda*size_of_datatype,kind=c_intptr_t), &
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
int(cudaMemcpyHostToDevice,kind=c_int))
check_memcpy_cuda("bandred: a_mat -> a_dev", successCUDA)
endif
endif
! Calculate scalar products of stored Householder vectors.
! This can be done in different ways; we use syrk (herk in the complex case)
vav = 0
call obj%timer%start("blas")
if (useGPU_reduction_lower_block_to_tridiagonal) then
if (l_rows>0) &
#if REALCASE == 1
call PRECISION_SYRK('U', 'T', &
#endif
#if COMPLEXCASE == 1
call PRECISION_HERK('U', 'C', &
#endif
int(n_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, &
vmrCUDA, int(cur_l_rows,kind=BLAS_KIND), &
ZERO, vav, int(ubound(vav,dim=1),kind=BLAS_KIND))
else ! useGPU_reduction_to_tridiagonal
if (l_rows>0) &
#if REALCASE == 1
call PRECISION_SYRK('U', 'T', &
#endif
#if COMPLEXCASE == 1
call PRECISION_HERK('U', 'C', &
#endif
int(n_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, vmrCPU, &
int(ubound(vmrCPU,dim=1),kind=BLAS_KIND), ZERO, vav, int(ubound(vav,dim=1),kind=BLAS_KIND))
endif
call obj%timer%stop("blas")
#if REALCASE == 1
call symm_matrix_allreduce_&
#endif
#if COMPLEXCASE == 1
call herm_matrix_allreduce_&
#endif
&PRECISION &
(obj, n_cols,vav, nbw, nbw,mpi_comm_rows)
! Calculate triangular matrix T for block Householder Transformation
call obj%timer%start("blas")
do lc=n_cols,1,-1
tau = tmat(lc,lc,istep)
if (lc a_dev", successCUDA)
endif
endif
#endif
! Transpose vmr -> vmc (stored in umc, second half)
if (useGPU) then
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, vmrCUDA(:), cur_l_rows, mpi_comm_rows, &
umcCUDA(cur_l_cols * n_cols + 1:), cur_l_cols, &
mpi_comm_cols, 1, istep*nbw, n_cols, nblk, max_threads)
else ! useGPU
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
1, istep*nbw, n_cols, nblk, max_threads)
endif
! Calculate umc = A**T * vmr
! Note that the distributed A has to be transposed
! As opposed to direct tridiagonalization, there is no need to exploit the cache locality
! of the tiles here, so we can use strips of the matrix
#if 0
! original complex implementation, kept to check performance
umcCPU(1:l_cols,1:n_cols) = 0.0_rck
vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
if (l_cols>0 .and. l_rows>0) then
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
lre = min(l_rows,(i+1)*l_rows_tile)
! tile-wise GEMM updates of umcCPU(lcs:lce,1:n_cols) and vmrCPU(:,n_cols+1:2*n_cols)
enddo
endif ! (l_cols>0 .and. l_rows>0)
#endif /* if 0 */
!Code for Algorithm 4
! n_way is actually a branch for the number of OpenMP threads
n_way = 1
#ifdef WITH_OPENMP_TRADITIONAL
#if REALCASE == 1
n_way = max_threads
!$omp parallel private( i,lcs,lce,lrs,lre)
#endif
if (n_way > 1) then
#if REALCASE == 1
!$omp do
#endif
do i=1,min(l_cols_tile, l_cols)
umcCPU(i,1:n_cols) = 0.0_rck
enddo
#if REALCASE == 1
!$omp do
#endif
do i=1,l_rows
vmrCPU(i,n_cols+1:2*n_cols) = 0.0_rck
enddo
if (l_cols>0 .and. l_rows>0) then
!SYMM variant 4
!Partitioned Matrix Expression:
! Ct = Atl Bt + Atr Bb
! Cb = Atr' Bt + Abl Bb
!
!Loop invariant:
! Ct = Atl Bt + Atr Bb
!
!Update:
! C1 = A10'B0 + A11B1 + A21 B2
!
!This algorithm was chosen because in this algorithm the loop around the dgemm calls
!is easily parallelized, and regardless of the choice of algorithm,
!the startup cost for parallelizing the dgemms inside the loop is too great
#if REALCASE == 1
!$omp do schedule(static,1)
#endif
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1 ! local column start
lce = min(l_cols, (i+1)*l_cols_tile) ! local column end
lrs = i*l_rows_tile+1 ! local row start
lre = min(l_rows, (i+1)*l_rows_tile) ! local row end
!C1 += [A11 A12] [B1
! B2]
if ( lre > lrs .and. l_cols > lcs ) then
call obj%timer%start("blas")
if (isSkewsymmetric) then
call PRECISION_GEMM('N', 'N', int(lre-lrs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), &
int(l_cols-lcs+1,kind=BLAS_KIND), &
-ONE, a_mat(lrs,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND), &
umcCPU(lcs,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), &
ZERO, vmrCPU(lrs,n_cols+1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND) )
else
call PRECISION_GEMM('N', 'N', int(lre-lrs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), &
int(l_cols-lcs+1,kind=BLAS_KIND), &
ONE, a_mat(lrs,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND), &
umcCPU(lcs,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), &
ZERO, vmrCPU(lrs,n_cols+1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND) )
endif
call obj%timer%stop("blas")
endif
! C1 += A10' B0
if ( lce > lcs .and. i > 0 ) then
call obj%timer%start("blas")
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
int(lce-lcs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(lrs-1,kind=BLAS_KIND), &
ONE, a_mat(1,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND), &
vmrCPU(1,1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND), &
ZERO, umcCPU(lcs,1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND) )
call obj%timer%stop("blas")
endif
enddo
endif ! l_cols>0 .and. l_rows>0
else ! n_way > 1
#endif /* WITH_OPENMP_TRADITIONAL */
if (.not. useGPU) then
umcCPU(1:l_cols,1:n_cols) = 0.0_rck
vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
endif ! useGPU
if (l_cols>0 .and. l_rows>0) then
if (useGPU) then
successCUDA = cuda_memset(vmr_dev+cur_l_rows*n_cols*size_of_datatype, &
0, cur_l_rows*n_cols*size_of_datatype)
check_memset_cuda("bandred: vmr_dev", successCUDA)
successCUDA = cuda_memcpy(vmr_dev, int(loc(vmrCUDA(1)),kind=c_intptr_t), &
cur_l_rows*n_cols*size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("bandred: vmrCUDA -> vmr_dev", successCUDA)
successCUDA = cuda_memset(umc_dev, 0, l_cols*n_cols*size_of_datatype)
check_memset_cuda("bandred: umc_dev", successCUDA)
successCUDA = cuda_memcpy(umc_dev+l_cols*n_cols*size_of_datatype, &
int(loc(umcCUDA(1+l_cols*n_cols)),kind=c_intptr_t), &
(umc_size-l_cols*n_cols)*size_of_datatype, &
cudaMemcpyHostToDevice)
check_memcpy_cuda("bandred: umcCUDA -> umc_dev", successCUDA)
endif ! useGPU
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
lre = min(l_rows,(i+1)*l_rows_tile)
! tile-wise GEMM update: umc(lcs:lce,1:n_cols) += A(1:lre,lcs:lce)**T * vmr(1:lre,1:n_cols) and
! vmr(:,n_cols+1:) += A(:,lcs:lce) * umc(lcs:lce,n_cols+1:), with BLAS on the host and cublas on the device
enddo
if (useGPU) then
if (tile_size < istep*nbw .or. n_way > 1) then
successCUDA = cuda_memcpy(int(loc(vmrCUDA(1+cur_l_rows*n_cols)),kind=c_intptr_t), &
vmr_dev+cur_l_rows*n_cols*size_of_datatype, &
(vmr_size-cur_l_rows*n_cols)*size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("bandred: vmr_dev -> vmrCUDA", successCUDA)
endif
successCUDA = cuda_memcpy(int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("bandred: umc_dev -> umcCUDA", successCUDA)
endif ! useGPU
endif ! l_cols>0 .and. l_rows>0
#ifdef WITH_OPENMP_TRADITIONAL
endif ! n_way > 1
#if REALCASE == 1
!$omp end parallel
#endif
#endif
! Sum up all ur(:) parts along rows and add them to the uc(:) parts
! on the processors containing the diagonal
! This is only necessary if ur has been calculated, i.e. if the
! global tile size is smaller than the global remaining matrix
! or if Algorithm 4 was used
if (tile_size < istep*nbw .or. n_way > 1) then
if (useGPU) then
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, vmrCUDA(cur_l_rows * n_cols + 1:),cur_l_rows, &
mpi_comm_rows, umcCUDA, &
cur_l_cols, mpi_comm_cols, istep*nbw, n_cols, nblk, max_threads)
else ! useGPU
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk, max_threads)
endif ! useGPU
endif ! tile_size < istep*nbw .or. n_way > 1
if (l_cols>0) then
if (useGPU) then
#ifdef WITH_MPI
allocate(tmpCUDA(l_cols * n_cols), stat=istat, errmsg=errorMessage)
check_allocate("bandred: tmpCUDA", istat, errorMessage)
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_allreduce(umcCUDA, tmpCUDA, int(l_cols*n_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), ierr)
umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
if (allocated(tmpCUDA)) then
deallocate(tmpCUDA, stat=istat, errmsg=errorMessage)
check_deallocate("bandred: tmpCUDA", istat, errorMessage)
endif
else ! useGPU
allocate(tmpCPU(l_cols,n_cols), stat=istat, errmsg=errorMessage)
check_allocate("bandred: tmpCPU", istat, errorMessage)
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_allreduce(umcCPU, tmpCPU, int(l_cols*n_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols)
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
deallocate(tmpCPU, stat=istat, errmsg=errorMessage)
check_deallocate("bandred: tmpCPU", istat, errorMessage)
endif ! useGPU
endif ! l_cols > 0
! U = U * Tmat**T
if (useGPU) then
successCUDA = cuda_memcpy(umc_dev, int(loc(umcCUDA(1)),kind=c_intptr_t), &
l_cols*n_cols*size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("bandred: umcCUDA -> umc_dev ", successCUDA)
successCUDA = cuda_memcpy(tmat_dev,int(loc(tmat(1,1,istep)),kind=c_intptr_t), &
nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
check_memcpy_cuda("bandred: tmat -> tmat_dev ", successCUDA)
call obj%timer%start("cublas")
call cublas_PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', &
l_cols, n_cols, ONE, tmat_dev, nbw, umc_dev, cur_l_cols)
call obj%timer%stop("cublas")
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
call obj%timer%start("cublas")
call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
n_cols, n_cols, l_cols, ONE, umc_dev, cur_l_cols, &
(umc_dev+(cur_l_cols * n_cols )*size_of_datatype),cur_l_cols, &
ZERO, vav_dev, nbw)
call cublas_PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', &
n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw)
call obj%timer%stop("cublas")
successCUDA = cuda_memcpy(int(loc(vav),kind=c_intptr_t), &
vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("bandred: vav_dev -> vav ", successCUDA)
else ! useGPU
call obj%timer%start("blas")
call PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', &
int(l_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), ONE, tmat(1,1,istep), &
int(ubound(tmat,dim=1),kind=BLAS_KIND), &
umcCPU, int(ubound(umcCPU,dim=1),kind=BLAS_KIND))
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
int(n_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), &
ONE, umcCPU, int(ubound(umcCPU,dim=1),kind=BLAS_KIND), umcCPU(1,n_cols+1), &
int(ubound(umcCPU,dim=1),kind=BLAS_KIND), ZERO, vav, int(ubound(vav,dim=1),kind=BLAS_KIND))
call PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', &
int(n_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), ONE, tmat(1,1,istep), &
int(ubound(tmat,dim=1),kind=BLAS_KIND), vav, int(ubound(vav,dim=1),kind=BLAS_KIND) )
call obj%timer%stop("blas")
endif ! useGPU
#if REALCASE == 1
#ifdef HAVE_SKEWSYMMETRIC
if (isSkewsymmetric) then
call ssymm_matrix_allreduce_&
&PRECISION &
(obj, n_cols,vav, nbw, nbw ,mpi_comm_cols)
else
#endif
call symm_matrix_allreduce_&
&PRECISION &
(obj, n_cols,vav, nbw, nbw ,mpi_comm_cols)
#ifdef HAVE_SKEWSYMMETRIC
endif
#endif
#endif /* REALCASE */
#if COMPLEXCASE == 1
call herm_matrix_allreduce_&
&PRECISION &
(obj, n_cols,vav, nbw, nbw ,mpi_comm_cols)
#endif
if (useGPU) then
successCUDA = cuda_memcpy(vav_dev, int(loc(vav),kind=c_intptr_t), &
nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
check_memcpy_cuda("bandred: vav -> vav_dev ", successCUDA)
endif
! U = U - 0.5 * V * VAV
if (useGPU) then
call obj%timer%start("cublas")
if (isSkewsymmetric) then
call cublas_PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols,&
#if REALCASE == 1
0.5_rk, &
#endif
#if COMPLEXCASE == 1
(0.5_rk, 0.0_rk), &
#endif
(umc_dev+(cur_l_cols * n_cols )* &
size_of_datatype), &
cur_l_cols, vav_dev,nbw, &
ONE, umc_dev, cur_l_cols)
else
call cublas_PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols,&
#if REALCASE == 1
-0.5_rk, &
#endif
#if COMPLEXCASE == 1
(-0.5_rk, 0.0_rk), &
#endif
(umc_dev+(cur_l_cols * n_cols )* &
size_of_datatype), &
cur_l_cols, vav_dev,nbw, &
ONE, umc_dev, cur_l_cols)
endif
call obj%timer%stop("cublas")
successCUDA = cuda_memcpy(int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_dev, umc_size*size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("bandred: umc_dev -> umcCUDA ", successCUDA)
! Transpose umc -> umr (stored in vmr, second half)
if (isSkewsymmetric) then
call elpa_transpose_vectors_ss_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, umcCUDA(:), cur_l_cols, mpi_comm_cols, &
vmrCUDA(cur_l_rows * n_cols + 1:), cur_l_rows, mpi_comm_rows, &
1, istep*nbw, n_cols, nblk, max_threads)
else
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, umcCUDA, cur_l_cols, mpi_comm_cols, &
vmrCUDA(cur_l_rows * n_cols + 1:), cur_l_rows, mpi_comm_rows, &
1, istep*nbw, n_cols, nblk, max_threads)
endif
successCUDA = cuda_memcpy(vmr_dev+cur_l_rows*n_cols*size_of_datatype, &
int(loc(vmrCUDA(1+cur_l_rows*n_cols)),kind=c_intptr_t), &
(vmr_size-cur_l_rows*n_cols)*size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("bandred: vmr -> vmrCUDA ", successCUDA)
else ! useGPU
call obj%timer%start("blas")
#if REALCASE == 1
if (isSkewsymmetric) then
call PRECISION_GEMM('N', 'N', int(l_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), &
0.5_rk, umcCPU(1,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), vav, &
int(ubound(vav,dim=1),kind=BLAS_KIND), ONE, umcCPU, int(ubound(umcCPU,dim=1),kind=BLAS_KIND) )
else
call PRECISION_GEMM('N', 'N', int(l_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), &
-0.5_rk, umcCPU(1,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), vav, &
int(ubound(vav,dim=1),kind=BLAS_KIND), ONE, umcCPU, int(ubound(umcCPU,dim=1),kind=BLAS_KIND) )
endif
#endif
#if COMPLEXCASE == 1
call PRECISION_GEMM('N', 'N', int(l_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), &
(-0.5_rk, 0.0_rk), &
umcCPU(1,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), vav, &
int(ubound(vav,dim=1),kind=BLAS_KIND), ONE, umcCPU, int(ubound(umcCPU,dim=1),kind=BLAS_KIND))
#endif
call obj%timer%stop("blas")
! Transpose umc -> umr (stored in vmr, second half)
if (isSkewsymmetric) then
call elpa_transpose_vectors_ss_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk, max_threads)
else
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk, max_threads)
endif
endif ! useGPU
! A = A - V*U**T - U*V**T
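! vmr(:,1:n_cols) holds V and vmr(:,n_cols+1:) holds U in the row distribution; umc(:,1:n_cols) holds U and
! umc(:,n_cols+1:) holds V in the column distribution, so both rank-n_cols updates are done by a single GEMM
! with inner dimension 2*n_cols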
#ifdef WITH_OPENMP_TRADITIONAL
!$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
n_threads = omp_get_num_threads()
if (mod(n_threads, 2) == 0) then
n_way = 2
else
n_way = 1
endif
m_way = n_threads / n_way
m_id = mod(omp_get_thread_num(), m_way)
n_id = omp_get_thread_num() / m_way
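! e.g. with 8 threads: n_way = 2 and m_way = 4, i.e. the threads form a 4 x 2 grid of row ranges (m_id)
! times interleaved column tiles (n_id)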
do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way
i = ii / tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<1) cycle
!Figure out this thread's range of rows
work_per_thread = lre / m_way
if (work_per_thread * m_way < lre) work_per_thread = work_per_thread + 1
mystart = m_id * work_per_thread + 1
myend   = mystart + work_per_thread - 1
if (myend > lre) myend = lre
if (myend-mystart+1 < 1) cycle
call obj%timer%start("blas")
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), &
int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, &
vmrCPU(mystart, 1), int(ubound(vmrCPU,1),kind=BLAS_KIND), &
umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), &
ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND) )
call obj%timer%stop("blas")
enddo
!$omp end parallel
#else /* WITH_OPENMP_TRADITIONAL */
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<1) cycle
if (useGPU) then
call obj%timer%start("cublas")
call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, lre, lce-lcs+1, 2*n_cols, -ONE, &
vmr_dev, cur_l_rows, (umc_dev+(lcs-1)*size_of_datatype), cur_l_cols, &
ONE, (a_dev+(lcs-1)*lda*size_of_datatype), lda)
call obj%timer%stop("cublas")
else ! useGPU
call obj%timer%start("blas")
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(lre,kind=BLAS_KIND), int(lce-lcs+1,kind=BLAS_KIND), &
int(2*n_cols,kind=BLAS_KIND), -ONE, &
vmrCPU, int(ubound(vmrCPU,dim=1),kind=BLAS_KIND), &
umcCPU(lcs,1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), &
ONE, a_mat(1,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND))
call obj%timer%stop("blas")
endif ! useGPU
enddo
#endif /* WITH_OPENMP_TRADITIONAL */
if (.not. useGPU) then
! the CPU work arrays are re-allocated with the sizes of the next step
deallocate(vr, umcCPU, vmrCPU, stat=istat, errmsg=errorMessage)
check_deallocate("bandred: vr, umcCPU, vmrCPU", istat, errorMessage)
endif
enddo ! istep
if (useGPU) then
! a_mat is needed on the host in the following steps (band to tridiagonal); copy the reduced matrix back
successCUDA = cuda_memcpy(int(loc(a_mat),kind=c_intptr_t), a_dev, &
lda*na_cols*size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("bandred: a_dev -> a_mat ", successCUDA)
successCUDA = cuda_host_unregister(int(loc(a_mat),kind=c_intptr_t))
check_host_unregister_cuda("bandred: a_mat ", successCUDA)
successCUDA = cuda_free(a_dev)
check_dealloc_cuda("bandred: a_dev ", successCUDA)
successCUDA = cuda_free(vav_dev)
check_dealloc_cuda("bandred: vav_dev ", successCUDA)
successCUDA = cuda_free(tmat_dev)
check_dealloc_cuda("bandred: tmat_dev ", successCUDA)
successCUDA = cuda_host_unregister(int(loc(vav),kind=c_intptr_t))
check_host_unregister_cuda("bandred: vav", successCUDA)
if (associated(umcCUDA)) then
nullify(umcCUDA)
successCUDA = cuda_free_host(umc_host)
check_host_dealloc_cuda("bandred: umc_host ", successCUDA)
successCUDA = cuda_free(umc_dev)
check_dealloc_cuda("bandred: umc_dev ", successCUDA)
endif
if (associated(vmrCUDA)) then
nullify(vmrCUDA)
successCUDA = cuda_free_host(vmr_host)
check_host_dealloc_cuda("bandred: vmr_host ", successCUDA)
successCUDA = cuda_free(vmr_dev)
check_dealloc_cuda("bandred: vmr_dev ", successCUDA)
endif
endif ! useGPU
if (allocated(vr)) then
deallocate(vr, stat=istat, errmsg=errorMessage)
check_deallocate("bandred: vr", istat, errorMessage)
endif
if (allocated(umcCPU)) then
deallocate(umcCPU, stat=istat, errmsg=errorMessage)
check_deallocate("bandred: umcCPU", istat, errorMessage)
endif
if (allocated(vmrCPU)) then
deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
check_deallocate("bandred: vmrCPU", istat, errorMessage)
endif
#if REALCASE == 1
if (useQR) then
if (which_qr_decomposition == 1) then
deallocate(work_blocked, stat=istat, errmsg=errorMessage)
check_deallocate("bandred: work_blocked", istat, errorMessage)
deallocate(tauvector, stat=istat, errmsg=errorMessage)
check_deallocate("bandred: tauvector", istat, errorMessage)
endif
endif
#endif
call obj%timer%stop("bandred_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX //&
gpuString)
end subroutine bandred_&
&MATH_DATATYPE&
&_&
&PRECISION