#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), fomerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
subroutine M_bandred_real_PRECISSION(na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
tmat, tmat_dev, wantDebug, useGPU, success, useQR)
!-------------------------------------------------------------------------------
! bandred_real: Reduces a distributed symmetric matrix to band form
!
! Parameters
!
! na Order of matrix
!
! a(lda,matrixCols) Distributed matrix which should be reduced.
! Distribution is like in Scalapack.
! Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
! a(:,:) is overwritten on exit with the band and the Householder vectors
! in the upper half.
!
! lda Leading dimension of a
! matrixCols local columns of matrix a
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwith of output matrix
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1
! Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
use cuda_functions
use iso_c_binding
use elpa1_compute
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
#ifdef WITH_OPENMP
use omp_lib
#endif
use precision
implicit none
integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE) :: a(lda,*), tmat(nbw,nbw,*)
#else
real(kind=REAL_DATATYPE) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
#endif
real(kind=REAL_DATATYPE) :: eps
logical, intent(in) :: useGPU
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, vmrCols
integer(kind=ik) :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
integer(kind=ik) :: istep, ncol, lch, lcx, nlc, mynlc
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
real(kind=REAL_DATATYPE) :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
real(kind=REAL_DATATYPE), allocatable :: tmpCUDA(:), vmrCUDA(:), umcCUDA(:)
real(kind=REAL_DATATYPE), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
real(kind=REAL_DATATYPE), allocatable :: vr(:)
! needed for blocked QR decomposition
integer(kind=ik) :: PQRPARAM(11), work_size
real(kind=REAL_DATATYPE) :: dwork_size(1)
real(kind=REAL_DATATYPE), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)
! a_dev is passed from bandred_real to trans_ev_band
integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
#ifdef WITH_MPI
integer(kind=ik), external :: numroc
#endif
integer(kind=ik) :: ierr
integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size
integer(kind=c_size_t) :: lc_start, lc_end
integer(kind=ik) :: lr_end
integer(kind=ik) :: na_cols !, na_rows
logical, intent(in) :: wantDebug
logical, intent(out) :: success
logical :: successCUDA
integer(kind=ik) :: istat
character(200) :: errorMessage
logical, intent(in) :: useQR
integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
ii, pp, transformChunkSize
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("bandred_real" + M_PRECISSION_SUFFIX)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
success = .true.
! Semibandwith nbw must be a multiple of blocksize nblk
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
! na_rows in used nowhere; only na_cols
if (useGPU) then
#ifdef WITH_MPI
! na_rows = numroc(na, nblk, my_prow, 0, np_rows)
na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
! na_rows = na
na_cols = na
#endif
endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
if (useQR) then
if (useGPU) then
print *,"qr decomposition at the moment not supported with GPU"
stop
endif
if (which_qr_decomposition == 1) then
call qr_pqrparam_init(pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s')
allocate(tauvector(na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating tauvector "//errorMessage
stop
endif
allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating blockheuristic "//errorMessage
stop
endif
l_rows = local_index(na, my_prow, np_rows, nblk, -1)
allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vmrCPU "//errorMessage
stop
endif
vmrCols = na
#ifdef USE_ASSUMED_SIZE_QR
call M_qr_pdgeqrf_2dcomm_PRECISSION(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
#else
call M_qr_pdgeqrf_2dcomm_PRECISSION(a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif
work_size = dwork_size(1)
allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating work_blocked "//errorMessage
stop
endif
work_blocked = M_CONST_0_0
deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
stop
endif
endif ! which_qr_decomposition
endif ! useQr
if (useGPU) then
! Here we convert the regular host array into a pinned host array
successCUDA = cuda_malloc(a_dev, lda*na_cols*M_size_of_PRECISSION_real)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*M_size_of_PRECISSION_real)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
successCUDA = cuda_malloc(vav_dev, nbw*nbw*M_size_of_PRECISSION_real)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
cur_l_rows = 0
cur_l_cols = 0
successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)*M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
endif ! useGPU
do istep = (na-1)/nbw, 1, -1
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
! Number of local columns/rows of remaining matrix
l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
if (useGPU) then
cur_l_rows = max(l_rows, 1)
cur_l_cols = max(l_cols, 1)
vmr_size = cur_l_rows * 2 * n_cols
umc_size = cur_l_cols * 2 * n_cols
! Allocate vmr and umc only if the inew size exceeds their current capacity
! Added for FORTRAN CALLS
if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
if (allocated(vr)) then
deallocate(vr, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating vr "//errorMessage
stop
endif
endif
allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vr "//errorMessage
stop
endif
endif
if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
if (allocated(vmrCUDA)) then
deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vmrCUDA "//errorMessage
stop
endif
successCUDA = cuda_free(vmr_dev)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cuda_free"
stop
endif
endif
allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vmrCUDA "//errorMessage
stop
endif
successCUDA = cuda_malloc(vmr_dev, vmr_size*M_size_of_PRECISSION_real)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
endif
if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
if (allocated(umcCUDA)) then
deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating umcCUDA "//errorMessage
stop
endif
successCUDA = cuda_free(umc_dev)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaFree"
stop
endif
endif
allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating umcCUDA "//errorMessage
stop
endif
successCUDA = cuda_malloc(umc_dev, umc_size*M_size_of_PRECISSION_real)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
endif
else ! GPU not used
! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vmrCPU "//errorMessage
stop
endif
allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating umcCPU "//errorMessage
stop
endif
allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vr "//errorMessage
stop
endif
endif ! use GPU
if (useGPU) then
vmrCUDA(1 : cur_l_rows * n_cols) = M_CONST_0_0
else
vmrCPU(1:l_rows,1:n_cols) = M_CONST_0_0
endif
vr(:) = M_CONST_0_0
tmat(:,:,istep) = M_CONST_0_0
if (useGPU) then
umcCUDA(1 : umc_size) = M_CONST_0_0
lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
lr_end = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)
if(lc_start .le. 0) lc_start = 1
! Here we assume that the processor grid and the block grid are aligned
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if(my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), lda*M_size_of_PRECISSION_real, &
(a_dev + ((lc_start-1) * lda*M_size_of_PRECISSION_real)), &
lda*M_size_of_PRECISSION_real, lr_end*M_size_of_PRECISSION_real, &
(lc_end - lc_start+1), cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy2d"
stop
endif
endif
endif ! useGPU
! Reduce current block to lower triangular form
if (useQR) then
if (which_qr_decomposition == 1) then
vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
call M_qr_pdgeqrf_2dcomm_PRECISSION(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, &
work_size, na, n_cols, nblk, nblk, &
istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
blockheuristic)
#else
call M_qr_pdgeqrf_2dcomm_PRECISSION(a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) , &
max(l_rows,1), vmrCols, tauvector(1:na), na, &
tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
work_size, na, n_cols, nblk, nblk, &
istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
blockheuristic)
#endif
endif
else !useQR
do lc = n_cols, 1, -1
ncol = istep*nbw + lc ! absolute column number of householder vector
nrow = ncol - nbw ! Absolute number of pivot row
lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
tau = 0
if (nrow == 1) exit ! Nothing to do
cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block
if (my_pcol==cur_pcol) then
! Get vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
vr(1:lr) = a(1:lr,lch) ! vector to be transformed
if (my_prow==prow(nrow, nblk, np_rows)) then
aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
aux1(2) = vr(lr)
else
aux1(1) = dot_product(vr(1:lr),vr(1:lr))
aux1(2) = M_CONST_0_0
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(aux1, aux2, 2, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
aux2 = aux1 ! this should be optimized
#endif
vnorm2 = aux2(1)
vrl = aux2(2)
! Householder transformation
call M_hh_transform_real_PRECISSION(vrl, vnorm2, xf, tau)
! Scale vr and store Householder vector for back transformation
vr(1:lr) = vr(1:lr) * xf
if (my_prow==prow(nrow, nblk, np_rows)) then
a(1:lr-1,lch) = vr(1:lr-1)
a(lr,lch) = vrl
vr(lr) = M_CONST_1_0
else
a(1:lr,lch) = vr(1:lr)
endif
endif
! Broadcast Householder vector and tau along columns
vr(lr+1) = tau
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Bcast(vr, lr+1, M_MPI_REAL_PRECISSION, cur_pcol, mpi_comm_cols, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
if (useGPU) then
vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
else
vmrCPU(1:lr,lc) = vr(1:lr)
endif
tau = vr(lr+1)
tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
! Transform remaining columns in current block with Householder vector
! Local dot product
aux1 = 0
#ifdef WITH_OPENMP
!Open up one omp region to avoid paying openmp overhead.
!This does not help performance due to the addition of two openmp barriers around the MPI call,
!But in the future this may be beneficial if these barriers are replaced with a faster implementation
!$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
mynlc = 0 ! number of local columns
!This loop does not have independent iterations,
!'mynlc' is incremented each iteration, and it is difficult to remove this dependency
!Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
!That is, a thread only executes the work associated with an iteration if its thread id is congruent to
!the iteration number modulo the number of threads
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0 ) then
mynlc = mynlc+1
if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
endif
enddo
! Get global dot products
!$omp barrier
!$omp single
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
!$omp end single
!$omp barrier
! Transform
transformChunkSize=32
mynlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
mynlc = mynlc+1
!This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
!However, for some reason this is slower than doing it manually, so it is parallelized as below.
do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
do pp = 1,transformChunkSize
if (pp + ii > lr) exit
a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
enddo
enddo
endif
enddo
!$omp end parallel
#else /* WITH_OPENMP */
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
enddo
! Get global dot products
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
endif
enddo
#endif /* WITH_OPENMP */
enddo ! lc
if (useGPU) then
! store column tiles back to GPU
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+((lc_start-1)*lda*M_size_of_PRECISSION_real)), &
lda*M_size_of_PRECISSION_real, loc(a(1, lc_start)), &
lda*M_size_of_PRECISSION_real, lr_end*M_size_of_PRECISSION_real, &
(lc_end - lc_start+1),cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy2d"
stop
endif
endif
endif
! Calculate scalar products of stored Householder vectors.
! This can be done in different ways, we use dsyrk
vav = 0
if (useGPU) then
if (l_rows>0) &
call M_PRECISSION_SYRK('U', 'T', n_cols, l_rows, M_CONST_1_0, vmrCUDA, cur_l_rows, M_CONST_0_0, vav, ubound(vav,dim=1))
else
if (l_rows>0) &
call M_PRECISSION_SYRK('U', 'T', n_cols, l_rows, M_CONST_1_0, vmrCPU, ubound(vmrCPU,dim=1), M_CONST_0_0, vav, ubound(vav,dim=1))
endif
call M_symm_matrix_allreduce_PRECISSION(n_cols,vav, nbw, nbw,mpi_comm_rows)
! Calculate triangular matrix T for block Householder Transformation
do lc=n_cols,1,-1
tau = tmat(lc,lc,istep)
if (lc vmc (stored in umc, second half)
if (useGPU) then
call M_elpa_transpose_vectors_real_PRECISSION (vmrCUDA, cur_l_rows, mpi_comm_rows, &
umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, mpi_comm_cols, &
1, istep*nbw, n_cols, nblk)
else
call M_elpa_transpose_vectors_real_PRECISSION (vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
1, istep*nbw, n_cols, nblk)
endif
! Calculate umc = A**T * vmr
! Note that the distributed A has to be transposed
! Opposed to direct tridiagonalization there is no need to use the cache locality
! of the tiles, so we can use strips of the matrix
! here the GPU version and CPU version diverged substantially, due to the newest
! optimizations due to Intel. The GPU version has to be re-written
if (useGPU) then
umcCUDA(1 : l_cols * n_cols) = M_CONST_0_0
vmrCUDA(cur_l_rows * n_cols + 1 : cur_l_rows * n_cols * 2) = M_CONST_0_0
if (l_cols>0 .and. l_rows>0) then
successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1)), vmr_size*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce0 .and. l_rows>0
else ! do not useGPU version
!Code for Algorithm 4
n_way = 1
#ifdef WITH_OPENMP
n_way = omp_get_max_threads()
#endif
!umc(1:l_cols,1:n_cols) = 0.d0
!vmr(1:l_rows,n_cols+1:2*n_cols) = 0
#ifdef WITH_OPENMP
!$omp parallel private( i,lcs,lce,lrs,lre)
#endif
if (n_way > 1) then
!$omp do
do i=1,min(l_cols_tile, l_cols)
umcCPU(i,1:n_cols) = M_CONST_0_0
enddo
!$omp do
do i=1,l_rows
vmrCPU(i,n_cols+1:2*n_cols) = M_CONST_0_0
enddo
if (l_cols>0 .and. l_rows>0) then
!SYMM variant 4
!Partitioned Matrix Expression:
! Ct = Atl Bt + Atr Bb
! Cb = Atr' Bt + Abl Bb
!
!Loop invariant:
! Ct = Atl Bt + Atr Bb
!
!Update:
! C1 = A10'B0 + A11B1 + A21 B2
!
!This algorithm chosen because in this algoirhtm, the loop around the dgemm calls
!is easily parallelized, and regardless of choise of algorithm,
!the startup cost for parallelizing the dgemms inside the loop is too great
!$omp do schedule(static,1)
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1 ! local column start
lce = min(l_cols, (i+1)*l_cols_tile) ! local column end
lrs = i*l_rows_tile+1 ! local row start
lre = min(l_rows, (i+1)*l_rows_tile) ! local row end
!C1 += [A11 A12] [B1
! B2]
if ( lre > lrs .and. l_cols > lcs ) then
call M_PRECISSION_GEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1, &
M_CONST_1_0, a(lrs,lcs), ubound(a,dim=1), &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), &
M_CONST_0_0, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1))
endif
! C1 += A10' B0
if ( lce > lcs .and. i > 0 ) then
call M_PRECISSION_GEMM('T', 'N', lce-lcs+1, n_cols, lrs-1, &
M_CONST_1_0, a(1,lcs), ubound(a,dim=1), &
vmrCPU(1,1), ubound(vmrCPU,dim=1), &
M_CONST_0_0, umcCPU(lcs,1), ubound(umcCPU,dim=1))
endif
enddo
endif ! l_cols>0 .and. l_rows>0
else ! n_way > 1
umcCPU(1:l_cols,1:n_cols) = M_CONST_0_0
vmrCPU(1:l_rows,n_cols+1:2*n_cols) = M_CONST_0_0
if (l_cols>0 .and. l_rows>0) then
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce 1
#ifdef WITH_OPENMP
!$omp end parallel
#endif
endif ! do not useGPU version
! Sum up all ur(:) parts along rows and add them to the uc(:) parts
! on the processors containing the diagonal
! This is only necessary if ur has been calculated, i.e. if the
! global tile size is smaller than the global remaining matrix
if (useGPU) then
! here the GPU version and CPU version divereged due to the same reasons as above
if (tile_size < istep*nbw) then
call M_elpa_reduce_add_vectors_real_PRECISSION (vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows,mpi_comm_rows, &
umcCUDA, cur_l_cols, mpi_comm_cols, &
istep*nbw, n_cols, nblk)
endif
if (l_cols>0) then
allocate(tmpCUDA(l_cols * n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating tmpCUDA "//errorMessage
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, ierr)
umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols)
#endif /* WITH_MPI */
if (allocated(tmpCUDA)) then
deallocate(tmpCUDA, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating tmpCUDA "//errorMessage
stop
endif
endif
endif ! l_cols
! U = U * Tmat**T
successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
call M_cublas_PRECISSION_trmm('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, &
M_CONST_1_0, tmat_dev, nbw, umc_dev, cur_l_cols)
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
call M_cublas_PRECISSION_gemm('T', 'N', n_cols, n_cols, l_cols, &
M_CONST_1_0, umc_dev, cur_l_cols, (umc_dev+(cur_l_cols * n_cols )*M_size_of_PRECISSION_real),cur_l_cols, &
M_CONST_0_0, vav_dev, nbw)
call M_cublas_PRECISSION_trmm('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, &
M_CONST_1_0, tmat_dev, nbw, vav_dev, nbw)
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
call M_symm_matrix_allreduce_PRECISSION(n_cols,vav, nbw,nbw,mpi_comm_cols)
successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
! U = U - 0.5 * V * VAV
call M_cublas_PRECISSION_gemm('N', 'N', l_cols, n_cols, n_cols,&
-M_CONST_0_5, (umc_dev+(cur_l_cols * n_cols )*M_size_of_PRECISSION_real),cur_l_cols, vav_dev,nbw,&
M_CONST_1_0, umc_dev, cur_l_cols)
successCUDA = cuda_memcpy(loc(umcCUDA(1)), umc_dev, umc_size*M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
! Transpose umc -> umr (stored in vmr, second half)
call M_elpa_transpose_vectors_real_PRECISSION (umcCUDA, cur_l_cols, mpi_comm_cols, &
vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1)), vmr_size*M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
! A = A - V*U**T - U*V**T
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce 1) then
call M_elpa_reduce_add_vectors_real_PRECISSION (vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
endif
if (l_cols>0) then
allocate(tmpCPU(l_cols,n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating tmpCPU "//errorMessage
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(umcCPU, tmpCPU, l_cols*n_cols, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr)
umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! tmpCPU(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols)
#endif /* WITH_MPI */
deallocate(tmpCPU, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating tmpCPU "//errorMessage
stop
endif
endif
! U = U * Tmat**T
call M_PRECISSION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', l_cols,n_cols, M_CONST_1_0, tmat(1,1,istep), ubound(tmat,dim=1), &
umcCPU, ubound(umcCPU,dim=1))
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
call M_PRECISSION_GEMM('T', 'N', n_cols, n_cols, l_cols, M_CONST_1_0, umcCPU, ubound(umcCPU,dim=1), umcCPU(1,n_cols+1), &
ubound(umcCPU,dim=1), M_CONST_0_0, vav, ubound(vav,dim=1))
call M_PRECISSION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, M_CONST_1_0, tmat(1,1,istep), &
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
call M_symm_matrix_allreduce_PRECISSION(n_cols,vav, nbw, nbw ,mpi_comm_cols)
! U = U - 0.5 * V * VAV
call M_PRECISSION_GEMM('N', 'N', l_cols, n_cols, n_cols, -M_CONST_0_5, umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), vav, &
ubound(vav,dim=1), M_CONST_1_0, umcCPU, ubound(umcCPU,dim=1))
! Transpose umc -> umr (stored in vmr, second half)
call M_elpa_transpose_vectors_real_PRECISSION(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
! A = A - V*U**T - U*V**T
#ifdef WITH_OPENMP
!$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
n_threads = omp_get_num_threads()
if (mod(n_threads, 2) == 0) then
n_way = 2
else
n_way = 1
endif
m_way = n_threads / n_way
m_id = mod(omp_get_thread_num(), m_way)
n_id = omp_get_thread_num() / m_way
do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way
i = ii / tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce lre ) myend = lre
if ( myend-mystart+1 < 1) cycle
call M_PRECISSION_GEMM('N', 'T', myend-mystart+1, lce-lcs+1, 2*n_cols, -M_CONST_1_0, &
vmrCPU(mystart, 1), ubound(vmrCPU,1), umcCPU(lcs,1), ubound(umcCPU,1), &
M_CONST_1_0, a(mystart,lcs), ubound(a,1))
enddo
!$omp end parallel
#else /* WITH_OPENMP */
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce0) then
call M_cublas_PRECISSION_gemm('T', 'N', n_cols, l_cols, l_rows, M_CONST_1_0, hvm_dev, max_local_rows, &
q_dev, ldq , M_CONST_0_0, tmp_dev, n_cols)
#ifdef WITH_MPI
! copy data from device to host for a later MPI_ALLREDUCE
! copy to host maybe this can be avoided this is needed if MPI is used (allreduce)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
#endif /* WITH_MPI */
else ! l_rows>0
#ifdef WITH_MPI
tmp1(1:l_cols*n_cols) = 0
#else
! if MPI is not used (we do not need to transfer because of MPI_ALLREDUCE) we can set to zero on the device
#ifdef WITH_GPU_VERSION
successCUDA = cuda_memset(tmp_dev, 0, l_cols*n_cols*M_size_of_PRECISSION_real)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemset"
stop
endif
#endif
#endif /* WITH_MPI */
endif ! l_rows>0
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(loc(tmp1), tmp_dev, max_local_cols*nbw*size_of_real_datatype,cudaMemcpyDeviceToHost)
! if (istat .ne. 0) then
! print *,"error in cudaMemcpy"
! stop
! endif
!#endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
#endif /* WITH_MPI */
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(tmp_dev, loc(tmp2), max_local_cols*nbw*size_of_real_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) then
! print *,"error in cudaMemcpy"
! stop
! endif
!#endif
if (l_rows>0) then
#ifdef WITH_MPI
! after the mpi_allreduce we have to copy back to the device
! copy back to device
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
#endif /* WITH_MPI */
!#ifdef WITH_MPI
! it should be possible to keep tmat on the device and not copy it aroud
! ! copy to device, maybe this can be avoided tmat is input from bandred_real
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
!#endif /* WITH_MPI */
call M_cublas_PRECISSION_trmm('L', 'U', 'T', 'N', n_cols, l_cols, M_CONST_1_0, tmat_dev, nbw, tmp_dev, n_cols)
call M_cublas_PRECISSION_gemm('N', 'N', l_rows, l_cols, n_cols, -M_CONST_1_0, hvm_dev, max_local_rows, &
tmp_dev, n_cols, M_CONST_1_0, q_dev, ldq)
! copy to host maybe this can be avoided
! this is not necessary hvm is not used anymore
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*M_size_of_PRECISSION_real),cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
endif ! l_rows > 0
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_real_datatype),cudaMemcpyDeviceToHost)
! if (istat .ne. 0) then
! print *,"error in cudaMemcpy"
! stop
! endif
!
!#endif
enddo ! istep
else ! do not useGPU
! t_blocking was formerly 2; 3 is a better choice
t_blocking = 3 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
! we only use the t_blocking if we could call it fully, this is might be better but needs to benchmarked.
! if ( na >= ((t_blocking+1)*nbw) ) then
cwy_blocking = t_blocking * nbw
allocate(tmp1(max_local_cols*cwy_blocking))
allocate(tmp2(max_local_cols*cwy_blocking))
allocate(hvb(max_local_rows*cwy_blocking))
allocate(hvm(max_local_rows,cwy_blocking))
allocate(tmat_complete(cwy_blocking,cwy_blocking))
allocate(t_tmp(cwy_blocking,nbw))
allocate(t_tmp2(cwy_blocking,nbw))
! else
! allocate(tmp1(max_local_cols*nbw))
! allocate(tmp2(max_local_cols*nbw))
! allocate(hvb(max_local_rows*nbw))
! allocate(hvm(max_local_rows,nbw))
! endif
hvm = M_CONST_0_0 ! Must be set to 0 !!!
hvb = M_CONST_0_0 ! Safety only
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
! if ( na >= ((t_blocking+1)*nbw) ) then
do istep=1,((na-1)/nbw-1)/t_blocking + 1
! This the call when using na >= ((t_blocking+1)*nbw)
! n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step
! As an alternative we add some special case handling if na < cwy_blocking
IF (na < cwy_blocking) THEN
n_cols = MAX(0, na-nbw)
IF ( n_cols .eq. 0 ) THEN
EXIT
END IF
ELSE
n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step
END IF
! Broadcast all Householder vectors for current step compressed in hvb
nb = 0
ns = 0
do lc = 1, n_cols
ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder vector
nrow = ncol - nbw ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number
if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh)
nb = nb+l_rows
if (lc==n_cols .or. mod(ncol,nblk)==0) then
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Bcast(hvb(ns+1), nb-ns, M_MPI_REAL_PRECISSION, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
ns = nb
endif
enddo
! Expand compressed Householder vectors into matrix hvm
nb = 0
do lc = 1, n_cols
nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = M_CONST_1_0
nb = nb+l_rows
enddo
l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1)
! compute tmat2 out of tmat(:,:,)
tmat_complete = 0
do i = 1, t_blocking
t_cols = MIN(nbw, n_cols - (i-1)*nbw)
if (t_cols <= 0) exit
t_rows = (i - 1) * nbw
tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i)
if (i > 1) then
call M_PRECISSION_GEMM('T', 'N', t_rows, t_cols, l_rows, M_CONST_1_0, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), &
max_local_rows, M_CONST_0_0, t_tmp, cwy_blocking)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(t_tmp, t_tmp2, cwy_blocking*nbw, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
call M_PRECISSION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, M_CONST_1_0, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
call M_PRECISSION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -M_CONST_1_0, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
t_tmp2, cwy_blocking)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
#else
! t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw)
call M_PRECISSION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, M_CONST_1_0, tmat_complete, cwy_blocking, t_tmp, cwy_blocking)
call M_PRECISSION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -M_CONST_1_0, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
t_tmp, cwy_blocking)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp(1:t_rows,1:t_cols)
#endif
! call M_PRECISSION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, M_CONST_1_0, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
! call M_PRECISSION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -M_CONST_1_0, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
! t_tmp2, cwy_blocking)
! tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
endif
enddo
! Q = Q - V * T**T * V**T * Q
if (l_rows>0) then
call M_PRECISSION_GEMM('T', 'N', n_cols, l_cols, l_rows, M_CONST_1_0, hvm, ubound(hvm,dim=1), &
q, ldq, M_CONST_0_0, tmp1, n_cols)
else ! l_rows>0
tmp1(1:l_cols*n_cols) = M_CONST_0_0
endif ! l_rows>0
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows ,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
if (l_rows>0) then
call M_PRECISSION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, M_CONST_1_0, tmat_complete, cwy_blocking, tmp2, n_cols)
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, n_cols, -M_CONST_1_0, hvm, ubound(hvm,dim=1), tmp2, n_cols, M_CONST_1_0, q, ldq)
endif
#else /* WITH_MPI */
! tmp2 = tmp1
if (l_rows>0) then
call M_PRECISSION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, M_CONST_1_0, tmat_complete, cwy_blocking, tmp1, n_cols)
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, n_cols, -M_CONST_1_0, hvm, ubound(hvm,dim=1), tmp1, n_cols, M_CONST_1_0, q, ldq)
endif
#endif /* WITH_MPI */
! if (l_rows>0) then
! call M_PRECISSION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, M_CONST_1_0, tmat_complete, cwy_blocking, tmp2, n_cols)
! call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, n_cols, -M_CONST_1_0, hvm, ubound(hvm,dim=1), tmp2, n_cols, M_CONST_1_0, q, ldq)
! endif
enddo ! istep
endif ! useGPU
deallocate(tmp1, tmp2, hvb, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_real: error when deallocating tmp1 tmp2 hvb "//errorMessage
stop
endif
if (useGPU) then
successCUDA = cuda_free(hvm_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaFree"
stop
endif
successCUDA = cuda_free(tmp_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaFree"
stop
endif
successCUDA = cuda_free(tmat_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaFree"
stop
endif
! final transfer of q_dev
successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols*M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaFree"
stop
endif
! q(1:ldq,1:na_cols) = q_temp(1:ldq,1:na_cols)
successCUDA = cuda_free(q_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaFree"
stop
endif
! deallocate(q_temp, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"error when deallocating q_temp "//errorMessage
! stop
! endif
! deallocate(tmat_temp, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_real: error when deallocating tmat_temp "//errorMessage
! stop
! endif
endif ! useGPU
deallocate(hvm, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_real: error when deallocating hvm "//errorMessage
stop
endif
if (useQr) then
deallocate(tmat_complete, t_tmp, t_tmp2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_real: error when deallocating tmat_complete, t_tmp, t_tmp2 "//errorMessage
stop
endif
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("trans_ev_band_to_full_real" + M_PRECISSION_SUFFIX)
#endif
end subroutine M_trans_ev_band_to_full_real_PRECISSION
subroutine M_tridiag_band_real_PRECISSION(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_real, &
mpi_comm_rows, mpi_comm_cols, mpi_comm)
!-------------------------------------------------------------------------------
! tridiag_band_real:
! Reduces a real symmetric band matrix to tridiagonal form
!
! na Order of matrix a
!
! nb Semi bandwith
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal
!
! lda Leading dimension of a
! matrixCols local columns of matrix a
!
! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output)
!
! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
! mpi_comm
! MPI-Communicator for the total processor set
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE), intent(in) :: a(lda,*)
#else
real(kind=REAL_DATATYPE), intent(in) :: a(lda,matrixCols)
#endif
real(kind=REAL_DATATYPE), intent(out) :: d(na), e(na) ! set only on PE 0
real(kind=REAL_DATATYPE), intent(out), &
allocatable :: hh_trans_real(:,:)
real(kind=REAL_DATATYPE) :: vnorm2, hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
real(kind=REAL_DATATYPE) :: hd(nb), hs(nb)
integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
integer(kind=ik) :: my_pe, n_pes, mpierr
integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols
integer(kind=ik) :: ireq_ab, ireq_hv
integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
#ifdef WITH_OPENMP
integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter
#ifdef WITH_MPI
! integer(kind=ik) :: my_mpi_status(MPI_STATUS_SIZE)
#endif
! integer(kind=ik), allocatable :: mpi_statuses(:,:), global_id_tmp(:,:)
integer(kind=ik), allocatable :: global_id_tmp(:,:)
integer(kind=ik), allocatable :: omp_block_limits(:)
real(kind=REAL_DATATYPE), allocatable :: hv_t(:,:), tau_t(:)
#endif
integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
integer(kind=ik), allocatable :: limits(:), snd_limits(:,:)
integer(kind=ik), allocatable :: block_limits(:)
real(kind=REAL_DATATYPE), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
#ifdef WITH_OPENMP
integer(kind=ik) :: omp_get_max_threads
#endif
integer :: istat
character(200) :: errorMessage
#ifndef WITH_MPI
integer(kind=ik) :: startAddr
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("tridiag_band_real" + M_PRECISSION_SUFFIX)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm,my_pe,mpierr)
call mpi_comm_size(mpi_comm,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
! Get global_id mapping 2D procssor coordinates to global id
allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating global_id "//errorMessage
stop
endif
global_id(:,:) = 0
global_id(my_prow, my_pcol) = my_pe
#ifdef WITH_OPENMP
allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating global_id_tmp "//errorMessage
stop
endif
#endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifndef WITH_OPENMP
call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr)
#else
global_id_tmp(:,:) = global_id(:,:)
call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr)
deallocate(global_id_tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating global_id_tmp "//errorMessage
stop
endif
#endif /* WITH_OPENMP */
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
! Total number of blocks in the band:
nblocks_total = (na-1)/nb + 1
! Set work distribution
allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating block_limits"//errorMessage
stop
endif
call divide_band(nblocks_total, n_pes, block_limits)
! nblocks: the number of blocks for my task
nblocks = block_limits(my_pe+1) - block_limits(my_pe)
! allocate the part of the band matrix which is needed by this PE
! The size is 1 block larger than needed to avoid extensive shifts
allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating ab"//errorMessage
stop
endif
ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety
! n_off: Offset of ab within band
n_off = block_limits(my_pe)*nb
! Redistribute band in a to ab
call M_redist_band_real_PRECISSION(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab)
! Calculate the workload for each sweep in the back transformation
! and the space requirements to hold the HH vectors
allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating limits"//errorMessage
stop
endif
call determine_workload(na, nb, np_rows, limits)
max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
num_hh_vecs = 0
num_chunks = 0
nx = na
do n = 1, nblocks_total
call determine_workload(nx, nb, np_rows, limits)
local_size = limits(my_prow+1) - limits(my_prow)
! add to number of householder vectors
! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor send below!
if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
num_hh_vecs = num_hh_vecs + local_size
num_chunks = num_chunks+1
endif
nx = nx - nb
enddo
! Allocate space for HH vectors
allocate(hh_trans_real(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating hh_trans_real"//errorMessage
stop
endif
! Allocate and init MPI requests
allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating ireq_hhr"//errorMessage
stop
endif
allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage) ! Send requests
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating ireq_hhs"//errorMessage
stop
endif
num_hh_vecs = 0
num_chunks = 0
nx = na
nt = 0
do n = 1, nblocks_total
call determine_workload(nx, nb, np_rows, limits)
local_size = limits(my_prow+1) - limits(my_prow)
if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
num_chunks = num_chunks+1
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_irecv(hh_trans_real(1,num_hh_vecs+1), nb*local_size, M_MPI_REAL_PRECISSION, nt, &
10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! carefull non-block recv data copy must be done at wait or send
! hh_trans_real(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)
#endif /* WITH_MPI */
num_hh_vecs = num_hh_vecs + local_size
endif
nx = nx - nb
if (n == block_limits(nt+1)) then
nt = nt + 1
endif
enddo
#ifdef WITH_MPI
ireq_hhs(:) = MPI_REQUEST_NULL
#endif
! Buffers for gathering/sending the HH vectors
allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating hh_gath"//errorMessage
stop
endif
allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating hh_send"//errorMessage
stop
endif
hh_gath(:,:,:) = M_CONST_0_0
hh_send(:,:,:) = M_CONST_0_0
! Some counters
allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating hh_cnt"//errorMessage
stop
endif
allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating hh_dst"//errorMessage
stop
endif
hh_cnt(:) = 1 ! The first transfomation vector is always 0 and not calculated at all
hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI
ireq_ab = MPI_REQUEST_NULL
ireq_hv = MPI_REQUEST_NULL
#endif
! Limits for sending
allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating snd_limits"//errorMessage
stop
endif
do iblk=1,nblocks
call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
enddo
#ifdef WITH_OPENMP
! OpenMP work distribution:
max_threads = 1
max_threads = omp_get_max_threads()
! For OpenMP we need at least 2 blocks for every thread
max_threads = MIN(max_threads, nblocks/2)
if (max_threads==0) max_threads = 1
allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating omp_block_limits"//errorMessage
stop
endif
! Get the OpenMP block limits
call divide_band(nblocks, max_threads, omp_block_limits)
allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating hv_t, tau_t"//errorMessage
stop
endif
hv_t = 0
tau_t = 0
#endif /* WITH_OPENMP */
! ---------------------------------------------------------------------------
! Start of calculations
na_s = block_limits(my_pe)*nb + 1
if (my_pe>0 .and. na_s<=na) then
! send first column to previous PE
! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_isend(ab_s, nb+1, M_MPI_REAL_PRECISSION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
endif
#ifndef WITH_MPI
startAddr = ubound(hh_trans_real,dim=2)
#endif
#ifdef WITH_OPENMP
do istep=1,na-1-block_limits(my_pe)*nb
#else
do istep=1,na-1
#endif
if (my_pe==0) then
n = MIN(na-na_s,nb) ! number of rows to be reduced
hv(:) = M_CONST_0_0
tau = M_CONST_0_0
! The last step (istep=na-1) is only needed for sending the last HH vectors.
! We don't want the sign of the last element flipped (analogous to the other sweeps)
if (istep < na-1) then
! Transform first column of remaining matrix
vnorm2 = sum(ab(3:n+1,na_s-n_off)**2)
call M_hh_transform_real_PRECISSION(ab(2,na_s-n_off),vnorm2,hf,tau)
hv(1) = M_CONST_1_0
hv(2:n) = ab(3:n+1,na_s-n_off)*hf
endif
d(istep) = ab(1,na_s-n_off)
e(istep) = ab(2,na_s-n_off)
if (istep == na-1) then
d(na) = ab(1,na_s+1-n_off)
e(na) = M_CONST_0_0
endif
else
if (na>na_s) then
! Receive Householder vector from previous task, from PE owning subdiagonal
#ifdef WITH_OPENMP
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_recv(hv, nb, M_MPI_REAL_PRECISSION, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_recv(hv, nb, M_MPI_REAL_PRECISSION, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
tau = hv(1)
hv(1) = M_CONST_1_0
endif
endif
na_s = na_s+1
if (na_s-n_off > nb) then
ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
ab(:,nblocks*nb+1:(nblocks+1)*nb) = M_CONST_0_0
n_off = n_off + nb
endif
#ifdef WITH_OPENMP
if (max_threads > 1) then
! Codepath for OpenMP
! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
! This simulates the behaviour of the MPI tasks which also work after each other.
! The code would be considerably easier, if the MPI communication would be made within
! the parallel region - this is avoided here since this would require
! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.
hv_t(:,1) = hv
tau_t(1) = tau
do iter = 1, 2
! iter=1 : work on first block
! iter=2 : work on remaining blocks
! This is done in 2 iterations so that we have a barrier in between:
! After the first iteration, it is guaranteed that the last row of the last block
! is completed by the next thread.
! After the first iteration it is also the place to exchange the last row
! with MPI calls
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif
!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
do my_thread = 1, max_threads
if (iter == 1) then
my_block_s = omp_block_limits(my_thread-1) + 1
my_block_e = my_block_s
else
my_block_s = omp_block_limits(my_thread-1) + 2
my_block_e = omp_block_limits(my_thread)
endif
do iblk = my_block_s, my_block_e
ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
ne = ns+nb-1 ! last column in block
if (istepna) exit
hv = hv_t(:,my_thread)
tau = tau_t(my_thread)
! Store Householder vector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1
hh_gath(1 ,hh_cnt(iblk),iblk) = tau
hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
! Transform diagonal block
call M_PRECISSION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, M_CONST_0_0, hd, 1)
x = dot_product(hv(1:nc),hd(1:nc))*tau
hd(1:nc) = hd(1:nc) - M_CONST_0_5*x*hv(1:nc)
call M_PRECISSION_SYR2('L', nc, -M_CONST_1_0 ,hd, 1, hv, 1, ab(1,ns), 2*nb-1)
hv_t(:,my_thread) = M_CONST_0_0
tau_t(my_thread) = M_CONST_0_0
if (nr<=0) cycle ! No subdiagonal block present any more
! Transform subdiagonal block
call M_PRECISSION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, M_CONST_0_0, hs, 1)
if (nr>1) then
! complete (old) Householder transformation for first column
ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
! calculate new Householder transformation for first column
! (stored in hv_t(:,my_thread) and tau_t(my_thread))
vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
call M_hh_transform_real_PRECISSION(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread))
hv_t(1 ,my_thread) = M_CONST_1_0
hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
ab(nb+2:,ns) = M_CONST_0_0
! update subdiagonal block for old and new Householder transformation
! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
call M_PRECISSION_GEMV('T',nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, M_CONST_0_0, h(2), 1)
x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
h(2:nb) = h(2:nb) - x*hv(2:nb)
! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
do i=2,nb
ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*h(i) - hs(1:nr)*hv(i)
enddo
else
! No new Householder transformation for nr=1, just complete the old one
ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
do i=2,nb
ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
enddo
! For safety: there is one remaining dummy transformation (but tau is 0 anyways)
hv_t(1,my_thread) = M_CONST_1_0
endif
enddo
enddo ! my_thread
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif
if (iter==1) then
! We are at the end of the first block
! Send our first column to previous PE
if (my_pe>0 .and. na_s <= na) then
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_isend(ab_s, nb+1, M_MPI_REAL_PRECISSION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
endif
! Request last column from next PE
ne = na_s + nblocks*nb - (max_threads-1) - 1
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
if (istep>=max_threads .and. ne <= na) then
call mpi_recv(ab(1,ne-n_off), nb+1, M_MPI_REAL_PRECISSION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr)
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
if (istep>=max_threads .and. ne <= na) then
ab(1:nb+1,ne-n_off) = ab_s(1:nb+1)
endif
#endif /* WITH_MPI */
else
! We are at the end of all blocks
! Send last HH vector and TAU to next PE if it has been calculated above
ne = na_s + nblocks*nb - (max_threads-1) - 1
if (istep>=max_threads .and. ne < na) then
#ifdef WITH_MPI
call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)
#endif
hv_s(1) = tau_t(max_threads)
hv_s(2:) = hv_t(2:,max_threads)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_isend(hv_s, nb, M_MPI_REAL_PRECISSION, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
endif
! "Send" HH vector and TAU to next OpenMP thread
do my_thread = max_threads, 2, -1
hv_t(:,my_thread) = hv_t(:,my_thread-1)
tau_t(my_thread) = tau_t(my_thread-1)
enddo
endif
enddo ! iter
else
! Codepath for 1 thread without OpenMP
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early
! as possible.
#endif /* WITH_OPENMP */
do iblk=1,nblocks
ns = na_s + (iblk-1)*nb - n_off ! first column in block
ne = ns+nb-1 ! last column in block
if (ns+n_off>na) exit
! Store Householder vector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1
hh_gath(1 ,hh_cnt(iblk),iblk) = tau
hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
#ifndef WITH_OPENMP
if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
! Wait for last transfer to finish
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
! Copy vectors into send buffer
hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
! Send to destination
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), M_MPI_REAL_PRECISSION, &
global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), &
10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! do the post-poned irecv here
startAddr = startAddr - hh_cnt(iblk)
hh_trans_real(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
#endif /* WITH_MPI */
! Reset counter and increase destination row
hh_cnt(iblk) = M_CONST_0_0
hh_dst(iblk) = hh_dst(iblk)+1
endif
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early
! as possible.
#endif /* WITH_OPENMP */
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
! Multiply diagonal block and subdiagonal block with Householder vector
if (iblk==nblocks .and. nc==nb) then
! We need the last column from the next PE.
! First do the matrix multiplications without last column ...
! Diagonal block, the contribution of the last element is added below!
ab(1,ne) = M_CONST_0_0
call M_PRECISSION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, M_CONST_0_0, hd, 1)
! Subdiagonal block
if (nr>0) call M_PRECISSION_GEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1, M_CONST_0_0, hs, 1)
! ... then request last column ...
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef WITH_OPENMP
call mpi_recv(ab(1,ne), nb+1, M_MPI_REAL_PRECISSION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#else
call mpi_recv(ab(1,ne), nb+1, M_MPI_REAL_PRECISSION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
ab(1:nb+1,ne) = ab_s(1:nb+1)
#endif /* WITH_MPI */
! ... and complete the result
hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau
else
! Normal matrix multiply
call M_PRECISSION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, M_CONST_0_0, hd, 1)
if (nr>0) call M_PRECISSION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, M_CONST_0_0, hs, 1)
endif
! Calculate first column of subdiagonal block and calculate new
! Householder transformation for this column
hv_new(:) = M_CONST_0_0 ! Needed, last rows must be 0 for nr < nb
tau_new = M_CONST_0_0
if (nr>0) then
! complete (old) Householder transformation for first column
ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
! calculate new Householder transformation ...
if (nr>1) then
vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
call M_hh_transform_real_PRECISSION(ab(nb+1,ns),vnorm2,hf,tau_new)
hv_new(1) = M_CONST_1_0
hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf
ab(nb+2:,ns) = M_CONST_0_0
endif
! ... and send it away immediatly if this is the last block
if (iblk==nblocks) then
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef WITH_OPENMP
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
#else
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
hv_s(1) = tau_new
hv_s(2:) = hv_new(2:)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_isend(hv_s, nb, M_MPI_REAL_PRECISSION, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
endif
endif
! Transform diagonal block
x = dot_product(hv(1:nc),hd(1:nc))*tau
hd(1:nc) = hd(1:nc) - M_CONST_0_5*x*hv(1:nc)
if (my_pe>0 .and. iblk==1) then
! The first column of the diagonal block has to be send to the previous PE
! Calculate first column only ...
ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) - hv(1:nc)*hd(1)
! ... send it away ...
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef WITH_OPENMP
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
#else
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
ab_s(1:nb+1) = ab(1:nb+1,ns)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_isend(ab_s, nb+1, M_MPI_REAL_PRECISSION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
! ... and calculate remaining columns with rank-2 update
if (nc>1) call M_PRECISSION_SYR2('L', nc-1, -M_CONST_1_0, hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1)
else
! No need to send, just a rank-2 update
call M_PRECISSION_SYR2('L', nc, -M_CONST_1_0, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
endif
! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb
if (nr>0) then
if (nr>1) then
call M_PRECISSION_GEMV('T', nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, hv_new, 1, M_CONST_0_0, h(2), 1)
x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new
h(2:nb) = h(2:nb) - x*hv(2:nb)
! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update
do i=2,nb
ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*h(i) - hs(1:nr)*hv(i)
enddo
else
! No double Householder transformation for nr=1, just complete the row
do i=2,nb
ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
enddo
endif
endif
! Use new HH vector for the next block
hv(:) = hv_new(:)
tau = tau_new
enddo
#ifdef WITH_OPENMP
endif
do iblk = 1, nblocks
if (hh_dst(iblk) >= np_rows) exit
if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit
if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
! Wait for last transfer to finish
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
! Copy vectors into send buffer
hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
! Send to destination
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), M_MPI_REAL_PRECISSION, &
global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), &
10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! do the post-poned irecv here
startAddr = startAddr - hh_cnt(iblk)
hh_trans_real(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
#endif /* WITH_MPI */
! Reset counter and increase destination row
hh_cnt(iblk) = M_CONST_0_0
hh_dst(iblk) = hh_dst(iblk)+1
endif
enddo
#endif /* WITH_OPENMP */
enddo ! istep
! Finish the last outstanding requests
#ifdef WITH_OPENMP
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
! allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks)), stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"tridiag_band_real: error when allocating mpi_statuses"//errorMessage
! stop
! endif
call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)
! deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"tridiag_band_real: error when deallocating mpi_statuses"//errorMessage
! stop
! endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#endif /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_barrier(mpi_comm,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
deallocate(ab, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating ab"//errorMessage
stop
endif
deallocate(ireq_hhr, ireq_hhs, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating ireq_hhr, ireq_hhs"//errorMessage
stop
endif
deallocate(hh_cnt, hh_dst, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating hh_cnt, hh_dst"//errorMessage
stop
endif
deallocate(hh_gath, hh_send, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating hh_gath, hh_send"//errorMessage
stop
endif
deallocate(limits, snd_limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating limits, send_limits"//errorMessage
stop
endif
deallocate(block_limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating block_limits"//errorMessage
stop
endif
deallocate(global_id, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating global_id"//errorMessage
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("tridiag_band_real" + M_PRECISSION_SUFFIX)
#endif
end subroutine M_tridiag_band_real_PRECISSION
subroutine M_trans_ev_tridi_to_band_real_PRECISSION(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
THIS_REAL_ELPA_KERNEL)
!-------------------------------------------------------------------------------
! trans_ev_tridi_to_band_real:
! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
!
! Parameters
!
! na Order of matrix a, number of rows of matrix q
!
! nev Number eigenvectors to compute (= columns of matrix q)
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nb semi bandwith
!
! q On input: Eigenvectors of tridiagonal matrix
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! ldq Leading dimension of q
! matrixCols local columns of matrix q
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns/both
!
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use cuda_functions
use precision
use pack_unpack_real
use pack_unpack_real_gpu
use compute_hh_trafo_real
use iso_c_binding
implicit none
logical, intent(in) :: useGPU
integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL
integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE) :: q(ldq,*)
#else
real(kind=REAL_DATATYPE) :: q(ldq,matrixCols)
#endif
integer(kind=c_intptr_t) :: q_dev
real(kind=REAL_DATATYPE), intent(in) :: hh_trans_real(:,:)
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2
integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end
integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end
integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length
integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count
#ifdef WITH_OPENMP
integer(kind=ik) :: thread_width, csw, b_off, b_len
#endif
integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd
integer(kind=ik) :: a_off, current_tv_off, max_blk_size
integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk
#ifdef WITH_OPENMP
#ifdef WITH_MPI
! integer(kind=ik) :: my_MPI_STATUS_(MPI_STATUS_SIZE)
#endif
#endif
logical :: flag
#ifdef WITH_OPENMP
real(kind=REAL_DATATYPE), pointer :: aIntern(:,:,:,:)
#else
real(kind=REAL_DATATYPE), pointer :: aIntern(:,:,:)
#endif
real(kind=REAL_DATATYPE) :: a_real
type(c_ptr) :: aIntern_ptr
real(kind=REAL_DATATYPE) , allocatable :: row(:)
real(kind=REAL_DATATYPE) , allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
real(kind=REAL_DATATYPE), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
real(kind=REAL_DATATYPE), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
#else
real(kind=REAL_DATATYPE), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
real(kind=REAL_DATATYPE), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
#endif
real(kind=REAL_DATATYPE), allocatable :: result_buffer(:,:,:)
real(kind=REAL_DATATYPE), allocatable :: bcast_buffer(:,:)
integer(kind=ik) :: tmp
! real*8, allocatable, device :: a_dev(:,:,:)
! real*8, allocatable, device :: bcast_buffer_dev(:,:)
! real*8, allocatable, device :: row_dev(:)
! real*8, allocatable, device :: row_group_dev(:,:)
! real*8, allocatable, device :: hh_dot_dev(:)
! real*8, allocatable, device :: hh_tau_dev(:)
integer(kind=c_intptr_t) :: aIntern_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
integer(kind=c_size_t) :: num
integer(kind=c_size_t) :: dev_offset, dev_offset_1
integer(kind=c_intptr_t) :: row_dev
integer(kind=c_intptr_t) :: row_group_dev
integer(kind=c_intptr_t) :: hh_dot_dev
integer(kind=c_intptr_t) :: hh_tau_dev
Integer(kind=ik) :: top, chunk, this_chunk
integer(kind=ik) :: row_group_size, unpack_idx
integer(kind=ik) :: n_off
integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:)
integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:)
integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:)
#ifdef WITH_OPENMP
! integer(kind=ik), allocatable :: mpi_statuses(:,:)
#endif
! MPI send/recv tags, arbitrary
integer(kind=ik), parameter :: bottom_recv_tag = 111
integer(kind=ik), parameter :: top_recv_tag = 222
integer(kind=ik), parameter :: result_recv_tag = 333
! Just for measuring the kernel performance
real(kind=c_double) :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double
! long integer
integer(kind=lik) :: kernel_flops, kernel_flops_recv
#ifdef WITH_OPENMP
integer(kind=ik) :: max_threads, my_thread
integer(kind=ik) :: omp_get_max_threads
#endif
logical, intent(in) :: wantDebug
logical :: success
integer(kind=ik) :: istat
character(200) :: errorMessage
logical :: successCUDA
#ifndef WITH_MPI
integer(kind=ik) :: j1
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("trans_ev_tridi_to_band_real" + M_PRECISSION_SUFFIX)
#endif
#ifdef WITH_GPU_VERSION
unpack_idx = 0
row_group_size = 0
#endif
success = .true.
kernel_time = 0.0
kernel_flops = 0
#ifdef WITH_OPENMP
max_threads = 1
max_threads = omp_get_max_threads()
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: band backtransform works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
nfact = nbw / nblk
! local number of eigenvectors
l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
if (l_nev==0) then
#ifdef WITH_OPENMP
thread_width = 0
#endif
stripe_width = 0
stripe_count = 0
last_stripe_width = 0
else
! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
! every primary cache
if (.not.(useGPU)) then
#ifdef WITH_OPENMP
thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
#endif
#ifdef DOUBLE_PRECISION_REAL
stripe_width = 48 ! Must be a multiple of 4
#else
stripe_width = 96 ! Must be a multiple of 8
#endif
#ifdef WITH_OPENMP
stripe_count = (thread_width-1)/stripe_width + 1
#else
stripe_count = (l_nev-1)/stripe_width + 1
#endif
! Adapt stripe width so that last one doesn't get too small
#ifdef WITH_OPENMP
stripe_width = (thread_width-1)/stripe_count + 1
#else
stripe_width = (l_nev-1)/stripe_count + 1
#endif
#ifdef DOUBLE_PRECISION_REAL
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
! (4 * sizeof(double) == 32)
#else
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
! (8 * sizeof(float) == 32)
#endif
else ! GPUs are used
stripe_width = 256 ! Must be a multiple of 4
stripe_count = (l_nev - 1) / stripe_width + 1
endif
last_stripe_width = l_nev - (stripe_count-1)*stripe_width
endif
! Determine the matrix distribution at the beginning
allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating limits"//errorMessage
stop
endif
call determine_workload(na, nbw, np_rows, limits)
max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
a_dim2 = max_blk_size + nbw
if (useGPU) then
num = (stripe_width*a_dim2*stripe_count)*M_size_of_PRECISSION_real
successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count*M_size_of_PRECISSION_real)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"//errorMessage
stop
endif
successCUDA = cuda_memset(aIntern_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage
stop
endif
else ! GPUs are not used
#if 0
!DEC$ ATTRIBUTES ALIGN: 64:: aIntern
#endif
#ifdef WITH_OPENMP
if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_real)) /= 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage
stop
endif
call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads])
! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here!
#else
if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_real)) /= 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage
stop
endif
call c_f_pointer(aIntern_ptr, aIntern,[stripe_width,a_dim2,stripe_count] )
!allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
aIntern(:,:,:) = M_CONST_0_0
#endif /* WITH_OPENMP */
endif !useGPU
allocate(row(l_nev), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating row"//errorMessage
stop
endif
row(:) = M_CONST_0_0
if (useGPU) then
num = (l_nev)*M_size_of_PRECISSION_real
successCUDA = cuda_malloc( row_dev,num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc "//errorMessage
stop
endif
successCUDA = cuda_memset(row_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage
stop
endif
! "row_group" and "row_group_dev" are needed for GPU optimizations
allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating row_group"//errorMessage
stop
endif
row_group(:, :) = M_CONST_0_0
num = (l_nev*nblk)*M_size_of_PRECISSION_real
! call cuda_malloc2d( row_group_dev,l_nev*M_size_of_PRECISSION_real,nblk*size_of_real_datatype)
successCUDA = cuda_malloc(row_group_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"//errorMessage
stop
endif
successCUDA = cuda_memset(row_group_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage
stop
endif
endif ! useGPU
! Copy q from a block cyclic distribution into a distribution with contiguous rows,
! and transpose the matrix using stripes of given stripe_width for cache blocking.
! The peculiar way it is done below is due to the fact that the last row should be
! ready first since it is the first one to start below
#ifdef WITH_OPENMP
! Please note about the OMP usage below:
! This is not for speed, but because we want the matrix a in the memory and
! in the cache of the correct thread (if possible)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
aIntern(:,:,:,my_thread) = M_CONST_0_0 ! if possible, do first touch allocation!
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif
#endif /* WITH_OPENMP */
do ip = np_rows-1, 0, -1
if (my_prow == ip) then
! Receive my rows which have not yet been received
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
src = mod((i-1)/nblk, np_rows)
if (src < my_prow) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Recv(row, l_nev, M_MPI_REAL_PRECISSION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call M_unpack_row_real_cpu_openmp_PRECISSION(aIntern, row, i-limits(ip), my_thread, stripe_count, &
thread_width, stripe_width, l_nev)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row
call M_unpack_and_prepare_row_group_real_gpu_PRECISSION(row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Recv(row_group(:, row_group_size), l_nev, M_MPI_REAL_PRECISSION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
#endif /* WITH_MPI */
else
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Recv(row, l_nev, M_MPI_REAL_PRECISSION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
call M_unpack_row_real_cpu_PRECISSION(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif
#endif /* WITH_OPENMP */
elseif (src==my_prow) then
src_offset = src_offset+1
if (.not.(useGPU)) row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call M_unpack_row_real_cpu_openmp_PRECISSION(aIntern, row, i-limits(ip), my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row
call M_unpack_and_prepare_row_group_real_gpu_PRECISSION(row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
row_group(:, row_group_size) = q(src_offset, 1:l_nev)
else
call M_unpack_row_real_cpu_PRECISSION(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif
#endif /* WITH_OPENMP */
endif
enddo
! Send all rows which have not yet been send
src_offset = 0
do dst = 0, ip-1
do i=limits(dst)+1,limits(dst+1)
if (mod((i-1)/nblk, np_rows) == my_prow) then
src_offset = src_offset+1
row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Send(row, l_nev, M_MPI_REAL_PRECISSION, dst, 0, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
endif
enddo
enddo
else if (my_prow < ip) then
! Send all rows going to PE ip
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
src = mod((i-1)/nblk, np_rows)
if (src == my_prow) then
src_offset = src_offset+1
row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Send(row, l_nev, M_MPI_REAL_PRECISSION, ip, 0, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
endif
enddo
! Receive all rows from PE ip
do i=limits(my_prow)+1,limits(my_prow+1)
src = mod((i-1)/nblk, np_rows)
if (src == ip) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Recv(row, l_nev, M_MPI_REAL_PRECISSION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call M_unpack_row_real_cpu_openmp_PRECISSION(aIntern, row, i-limits(my_prow), my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif
#else /* WITH_OPENMP */
if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row
call M_unpack_and_prepare_row_group_real_gpu_PRECISSION(row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev, &
row_group_size, nblk, unpack_idx, i - limits(my_prow), .false.)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Recv(row_group(:, row_group_size), l_nev, M_MPI_REAL_PRECISSION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else
row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
#endif
else
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Recv(row, l_nev, M_MPI_REAL_PRECISSION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! row(1:l_nev) = row(1:l_nev)
#endif
call M_unpack_row_real_cpu_PRECISSION(aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
endif
#endif /* WITH_OPENMP */
endif
enddo
endif
enddo
if (useGPU) then
! Force an unpacking of all remaining rows that haven't been unpacked yet
call M_unpack_and_prepare_row_group_real_gpu_PRECISSION(row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, &
a_dim2, l_nev, row_group_size, nblk, unpack_idx, -1, .true.)
successCUDA = cuda_devicesynchronize()
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaDeviceSynchronize"//errorMessage
stop
endif
endif
! Set up result buffer queue
num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows
num_result_buffers = 4*nfact
allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating result_buffer"//errorMessage
stop
endif
allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating result_send_request"//errorMessage
stop
endif
allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating result_recv_request"//errorMessage
stop
endif
#ifdef WITH_MPI
result_send_request(:) = MPI_REQUEST_NULL
result_recv_request(:) = MPI_REQUEST_NULL
#endif
! Queue up buffers
if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
do j = 1, min(num_result_buffers, num_result_blocks)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, M_MPI_REAL_PRECISSION, 0, result_recv_tag, &
mpi_comm_rows, result_recv_request(j), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! carefull the "recv" has to be done at the corresponding wait or send
! result_buffer(1: l_nev*nblk,1,j) =result_buffer(1:l_nev*nblk,1,nbuf)
#endif /* WITH_MPI */
enddo
endif
num_bufs_recvd = 0 ! No buffers received yet
! Initialize top/bottom requests
allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating top_send_request"//errorMessage
stop
endif
allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating top_recv_request"//errorMessage
stop
endif
allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bottom_send_request"//errorMessage
stop
endif
allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bottom_recv_request"//errorMessage
stop
endif
#ifdef WITH_MPI
top_send_request(:) = MPI_REQUEST_NULL
top_recv_request(:) = MPI_REQUEST_NULL
bottom_send_request(:) = MPI_REQUEST_NULL
bottom_recv_request(:) = MPI_REQUEST_NULL
#endif
#ifdef WITH_OPENMP
allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating top_border_send_buffer"//errorMessage
stop
endif
allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating top_border_recv_buffer"//errorMessage
stop
endif
allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_send_buffer"//errorMessage
stop
endif
allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_recv_buffer"//errorMessage
stop
endif
top_border_send_buffer(:,:) = M_CONST_0_0
top_border_recv_buffer(:,:) = M_CONST_0_0
bottom_border_send_buffer(:,:) = M_CONST_0_0
bottom_border_recv_buffer(:,:) = M_CONST_0_0
! Initialize broadcast buffer
#else /* WITH_OPENMP */
allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating top_border_send_bufer"//errorMessage
stop
endif
allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating top_border_recv_buffer"//errorMessage
stop
endif
allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_send_buffer"//errorMessage
stop
endif
allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_recv_buffer"//errorMessage
stop
endif
top_border_send_buffer(:,:,:) = M_CONST_0_0
top_border_recv_buffer(:,:,:) = M_CONST_0_0
bottom_border_send_buffer(:,:,:) = M_CONST_0_0
bottom_border_recv_buffer(:,:,:) = M_CONST_0_0
#endif /* WITH_OPENMP */
allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bcast_buffer"//errorMessage
stop
endif
bcast_buffer = M_CONST_0_0
if (useGPU) then
num = ( nbw * max_blk_size) * M_size_of_PRECISSION_real
successCUDA = cuda_malloc(bcast_buffer_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"
stop
endif
successCUDA = cuda_memset( bcast_buffer_dev, 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"
stop
endif
num = ((max_blk_size-1))*M_size_of_PRECISSION_real
successCUDA = cuda_malloc( hh_dot_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"
stop
endif
successCUDA = cuda_memset( hh_dot_dev, 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"
stop
endif
num = (max_blk_size)*M_size_of_PRECISSION_real
successCUDA = cuda_malloc( hh_tau_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"
stop
endif
successCUDA = cuda_memset( hh_tau_dev, 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"
stop
endif
endif ! useGPU
current_tv_off = 0 ! Offset of next row to be broadcast
! ------------------- start of work loop -------------------
a_off = 0 ! offset in aIntern (to avoid unnecessary shifts)
top_msg_length = 0
bottom_msg_length = 0
do sweep = 0, (na-1)/nbw
current_n = na - sweep*nbw
call determine_workload(current_n, nbw, np_rows, limits)
current_n_start = limits(my_prow)
current_n_end = limits(my_prow+1)
current_local_n = current_n_end - current_n_start
next_n = max(current_n - nbw, 0)
call determine_workload(next_n, nbw, np_rows, limits)
next_n_start = limits(my_prow)
next_n_end = limits(my_prow+1)
next_local_n = next_n_end - next_n_start
if (next_n_end < next_n) then
bottom_msg_length = current_n_end - next_n_end
else
bottom_msg_length = 0
endif
if (next_local_n > 0) then
next_top_msg_length = current_n_start - next_n_start
else
next_top_msg_length = 0
endif
if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
do i = 1, stripe_count
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width"
b_len = csw*nbw*max_threads
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, M_MPI_REAL_PRECISSION, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! carefull the "recieve" has to be done at the corresponding wait or send
! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, M_MPI_REAL_PRECISSION, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! carefull the recieve has to be done at the corresponding wait or send
! bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i)
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
enddo
endif
if (current_local_n > 1) then
if (my_pcol == mod(sweep,np_cols)) then
bcast_buffer(:,1:current_local_n) = hh_trans_real(:,current_tv_off+1:current_tv_off+current_local_n)
current_tv_off = current_tv_off + current_local_n
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_bcast(bcast_buffer, nbw*current_local_n, M_MPI_REAL_PRECISSION, mod(sweep,np_cols), mpi_comm_cols, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
if (useGPU) then
successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), &
nbw * current_local_n * M_size_of_PRECISSION_real , cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy"
stop
endif
call M_extract_hh_tau_real_gpu_PRECISSION(bcast_buffer_dev, hh_tau_dev, nbw, current_local_n, .false.)
call M_compute_hh_dot_products_real_gpu_PRECISSION(bcast_buffer_dev, hh_dot_dev, nbw, current_local_n)
endif
else ! (current_local_n > 1) then
! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_real
bcast_buffer(:,1) = M_CONST_0_0
if (useGPU) then
successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * M_size_of_PRECISSION_real)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"
stop
endif
call M_extract_hh_tau_real_gpu_PRECISSION(bcast_buffer_dev, hh_tau_dev, nbw, 1, .true.)
endif
endif ! (current_local_n > 1) then
if (l_nev == 0) cycle
if (current_local_n > 0) then
do i = 1, stripe_count
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
! Get real stripe width for strip i;
! The last OpenMP tasks may have an even smaller stripe with,
! but we don't care about this, i.e. we send/recv a bit too much in this case.
! csw: current_stripe_width
csw = min(stripe_width, thread_width-(i-1)*stripe_width)
#endif /* WITH_OPENMP */
!wait_b
if (current_n_end < current_n) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
n_off = current_local_n+a_off
b_len = csw*nbw
b_off = (my_thread-1)*b_len
aIntern(1:csw,n_off+1:n_off+nbw,i,my_thread) = &
reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /))
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
n_off = current_local_n+a_off
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) *M_size_of_PRECISSION_real
successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc(bottom_border_recv_buffer(1,1,i)), &
stripe_width*nbw*M_size_of_PRECISSION_real ,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy"
stop
endif
else
aIntern(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i)
endif
#endif /* WITH_OPENMP */
if (next_n_end < next_n) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, &
M_MPI_REAL_PRECISSION, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WTIH_MPI */
! carefull the recieve has to be done at the corresponding wait or send
! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, M_MPI_REAL_PRECISSION, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
!! carefull the recieve has to be done at the corresponding wait or send
!! bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i)
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
endif
endif
if (current_local_n <= bottom_msg_length + top_msg_length) then
!wait_t
if (top_msg_length>0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *M_size_of_PRECISSION_real
! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
successCUDA = cuda_memcpy( aIntern_dev+dev_offset , loc(top_border_recv_buffer(1,1,i)), &
stripe_width*top_msg_length*M_size_of_PRECISSION_real , cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy"
stop
endif
else
aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
endif ! useGPU
#endif /* WITH_OPENMP */
endif ! top_msg_length
!compute
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
if (top_msg_length>0) then
b_len = csw*top_msg_length
b_off = (my_thread-1)*b_len
aIntern(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
endif
call M_compute_hh_trafo_real_cpu_openmp_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, 0, current_local_n, i, my_thread, thread_width, &
THIS_REAL_ELPA_KERNEL)
! call compute_hh_trafo_real_cpu_openmp(aIntern,stripe_width,a_dim2,stripe_count, max_threads, l_nev, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
!0, current_local_n, i, my_thread, thread_width, &
! THIS_REAL_ELPA_KERNEL)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
call M_compute_hh_trafo_real_cpu_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, current_local_n, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#endif /* WITH_OPENMP */
!send_b
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
if (bottom_msg_length>0) then
n_off = current_local_n+nbw-bottom_msg_length+a_off
b_len = csw*bottom_msg_length*max_threads
bottom_border_send_buffer(1:b_len,i) = &
reshape(aIntern(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Isend(bottom_border_send_buffer(1,i), b_len, M_MPI_REAL_PRECISSION, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
if (next_top_msg_length > 0) then
top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw* &
next_top_msg_length*max_threads,i)
endif
#endif /* WITH_MPI */
endif
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
if (bottom_msg_length>0) then
n_off = current_local_n+nbw-bottom_msg_length+a_off
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *M_size_of_PRECISSION_real
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
stripe_width * bottom_msg_length * M_size_of_PRECISSION_real ,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy"
stop
endif
else
bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i)
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, M_MPI_REAL_PRECISSION, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
if (next_top_msg_length > 0) then
top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = &
bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i)
endif
#endif /* WITH_MPI */
endif
#endif /* WITH_OPENMP */
else ! current_local_n <= bottom_msg_length + top_msg_length
!compute
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
call M_compute_hh_trafo_real_cpu_openmp_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, current_local_n - bottom_msg_length, &
bottom_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
! call compute_hh_trafo_real_cpu_openmp(aIntern, stripe_width,a_dim2,stripe_count, max_threads, l_nev, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
!current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, thread_width, &
! THIS_REAL_ELPA_KERNEL)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
!send_b
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
if (bottom_msg_length > 0) then
n_off = current_local_n+nbw-bottom_msg_length+a_off
b_len = csw*bottom_msg_length*max_threads
bottom_border_send_buffer(1:b_len,i) = &
reshape(aIntern(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Isend(bottom_border_send_buffer(1,i), b_len, M_MPI_REAL_PRECISSION, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
if (next_top_msg_length > 0) then
top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw* &
next_top_msg_length*&
max_threads,i)
endif
#endif /* WITH_MPI */
endif
#else /* WITH_OPENMP */
call M_compute_hh_trafo_real_cpu_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
!send_b
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
if (bottom_msg_length > 0) then
n_off = current_local_n+nbw-bottom_msg_length+a_off
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *M_size_of_PRECISSION_real
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
stripe_width*bottom_msg_length*M_size_of_PRECISSION_real ,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error cudaMemcpy"
stop
endif
else
bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i)
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, M_MPI_REAL_PRECISSION, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
if (next_top_msg_length > 0) then
top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = &
bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i)
endif
#endif /* WITH_MPI */
endif
#endif /* WITH_OPENMP */
!compute
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call M_compute_hh_trafo_real_cpu_openmp_PRECISSION(aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, &
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, my_thread, &
thread_width, THIS_REAL_ELPA_KERNEL)
! call compute_hh_trafo_real_cpu_openmp(aIntern, stripe_width, a_dim2,stripe_count, max_threads, l_nev, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
! top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, &
! THIS_REAL_ELPA_KERNEL)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
call M_compute_hh_trafo_real_cpu_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#endif /* WITH_OPENMP */
!wait_t
if (top_msg_length>0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *M_size_of_PRECISSION_real
successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc( top_border_recv_buffer(:,1,i)), &
stripe_width * top_msg_length *M_size_of_PRECISSION_real ,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy"
stop
endif
else
aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
endif
#endif /* WITH_OPENMP */
endif
!compute
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
if (top_msg_length>0) then
b_len = csw*top_msg_length
b_off = (my_thread-1)*b_len
aIntern(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
endif
call M_compute_hh_trafo_real_cpu_openmp_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, 0, top_msg_length, i, my_thread, thread_width, &
THIS_REAL_ELPA_KERNEL)
! call compute_hh_trafo_real_cpu_openmp(aIntern, stripe_width,a_dim2,stripe_count, max_threads, l_nev, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
! 0, top_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
call M_compute_hh_trafo_real_cpu_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, top_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#endif /* WITH_OPENMP */
endif
if (next_top_msg_length > 0) then
!request top_border data
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
b_len = csw*next_top_msg_length*max_threads
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Irecv(top_border_recv_buffer(1,i), b_len, M_MPI_REAL_PRECISSION, my_prow-1, &
top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! carefull the "recieve" has to be done at the corresponding wait or send
! top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = &
! bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, M_MPI_REAL_PRECISSION, my_prow-1, &
top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! carefull the "recieve" has to be done at the corresponding wait or send
! top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = &
! bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i)
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
endif
!send_t
if (my_prow > 0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
b_len = csw*nbw*max_threads
top_border_send_buffer(1:b_len,i) = reshape(aIntern(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /))
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Isend(top_border_send_buffer(1,i), b_len, M_MPI_REAL_PRECISSION, &
my_prow-1, bottom_recv_tag, &
mpi_comm_rows, top_send_request(i), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
endif
if (next_n_end < next_n) then
bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
endif
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * M_size_of_PRECISSION_real
successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), aIntern_dev + dev_offset, &
stripe_width*nbw*M_size_of_PRECISSION_real ,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy"
stop
endif
else
top_border_send_buffer(:,1:nbw,i) = aIntern(:,a_off+1:a_off+nbw,i)
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, M_MPI_REAL_PRECISSION, my_prow-1, bottom_recv_tag, &
mpi_comm_rows, top_send_request(i), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i)
endif
if (next_n_end < next_n) then
bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i)
endif
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
endif
! Care that there are not too many outstanding top_recv_request's
if (stripe_count > 1) then
if (i>1) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#endif /* WITH_OPENMP */
else
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#endif /* WITH_OPENMP */
endif
endif
enddo
top_msg_length = next_top_msg_length
else
! wait for last top_send_request
do i = 1, stripe_count
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#endif /* WITH_OPENMP */
enddo
endif
! Care about the result
if (my_prow == 0) then
! topmost process sends nbw rows to destination processes
do j=0, nfact-1
num_blk = sweep*nfact+j ! global number of destination block, 0 based
if (num_blk*nblk >= na) exit
nbuf = mod(num_blk, num_result_buffers) + 1 ! buffer number to get this block
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#endif /* WITH_OPENMP */
dst = mod(num_blk, np_rows)
if (dst == 0) then
if (useGPU) then
row_group_size = min(na - num_blk*nblk, nblk)
call M_pack_row_group_real_gpu_PRECISSION(row_group_dev, aIntern_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, &
row_group(:, :), j * nblk + a_off, row_group_size)
do i = 1, row_group_size
q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i)
enddo
else ! useGPU
do i = 1, min(na - num_blk*nblk, nblk)
#ifdef WITH_OPENMP
call M_pack_row_real_cpu_openmp_PRECISSION(aIntern, row, j*nblk+i+a_off, stripe_width, &
stripe_count, max_threads, thread_width, l_nev)
#else
call M_pack_row_real_cpu_PRECISSION(aIntern, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
#endif
q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:)
enddo
endif ! useGPU
else ! (dst == 0)
if (useGPU) then
call M_pack_row_group_real_gpu_PRECISSION(row_group_dev, aIntern_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, &
result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
else ! useGPU
do i = 1, nblk
#ifdef WITH_OPENMP
call M_pack_row_real_cpu_openmp_PRECISSION(aIntern, result_buffer(:,i,nbuf), j*nblk+i+a_off, &
stripe_width, stripe_count, max_threads, thread_width, l_nev)
#else
call M_pack_row_real_cpu_PRECISSION(aIntern, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, &
last_stripe_width, stripe_count)
#endif
enddo
endif ! useGPU
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, M_MPI_REAL_PRECISSION, dst, &
result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
if (j+num_result_buffers < num_result_blocks) &
result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf)
if (my_prow > 0 .and. l_nev>0) then
do j1 = 1, min(num_result_buffers, num_result_blocks)
result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf)
enddo
endif
#endif /* WITH_MPI */
endif ! (dst == 0)
enddo !j=0, nfact-1
else ! (my_prow == 0)
! receive and store final result
do j = num_bufs_recvd, num_result_blocks-1
nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block
! If there is still work to do, just test for the next result request
! and leave the loop if it is not ready, otherwise wait for all
! outstanding requests
if (next_local_n > 0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
flag = .true.
#endif
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
flag = .true.
#endif
#endif /* WITH_OPENMP */
if (.not.flag) exit
else ! (next_local_n > 0)
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#endif /* WITH_OPENMP */
endif ! (next_local_n > 0)
! Fill result buffer into q
num_blk = j*np_rows + my_prow ! global number of current block, 0 based
do i = 1, min(na - num_blk*nblk, nblk)
q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf)
enddo
! Queue result buffer again if there are outstanding blocks left
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
if (j+num_result_buffers < num_result_blocks) &
call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, M_MPI_REAL_PRECISSION, 0, result_recv_tag, &
mpi_comm_rows, result_recv_request(nbuf), mpierr)
! carefull the "recieve" has to be done at the corresponding wait or send
! if (j+num_result_buffers < num_result_blocks) &
! result_buffer(1:l_nev*nblk,1,nbuf) = result_buffer(1:l_nev*nblk,1,nbuf)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
#endif /* WITH_MPI */
enddo ! j = num_bufs_recvd, num_result_blocks-1
num_bufs_recvd = j
endif ! (my_prow == 0)
! Shift the remaining rows to the front of aIntern (if necessary)
offset = nbw - top_msg_length
if (offset<0) then
if (wantDebug) write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: internal error, offset for shifting = ',offset
success = .false.
return
endif
a_off = a_off + offset
if (a_off + next_local_n + nbw > a_dim2) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, i, j), schedule(static, 1)
do my_thread = 1, max_threads
do i = 1, stripe_count
do j = top_msg_length+1, top_msg_length+next_local_n
aIntern(:,j,i,my_thread) = aIntern(:,j+a_off,i,my_thread)
enddo
enddo
enddo
#else /* WITH_OPENMP */
do i = 1, stripe_count
if (useGPU) then
chunk = min(next_local_n - 1, a_off)
do j = top_msg_length + 1, top_msg_length + next_local_n, chunk
top = min(j + chunk, top_msg_length + next_local_n)
this_chunk = top - j + 1
dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *M_size_of_PRECISSION_real
dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * &
M_size_of_PRECISSION_real
! it is not logical to set here always the value for the parameter
! "cudaMemcpyDeviceToDevice" do this ONCE at startup
! tmp = cuda_d2d(1)
successCUDA = cuda_memcpy( aIntern_dev + dev_offset , aIntern_dev +dev_offset_1, &
stripe_width*this_chunk*M_size_of_PRECISSION_real, cudaMemcpyDeviceToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error cudaMemcpy"
stop
endif
enddo
else ! not useGPU
do j = top_msg_length+1, top_msg_length+next_local_n
aIntern(:,j,i) = aIntern(:,j+a_off,i)
enddo
endif
enddo ! stripe_count
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
#endif /* WITH_OPENMP */
a_off = 0
endif
enddo
! Just for safety:
#ifdef WITH_MPI
if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol
if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol
if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol
if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol
#endif
if (my_prow == 0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef WITH_MPI
! allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers), stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_tridi_to_band_real: error when allocating mpi_statuses"//errorMessage
! stop
! endif
call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr)
! deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_tridi_to_band_real: error when deallocating mpi_statuses"//errorMessage
! stop
! endif
#endif
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
#endif /* WITH_OPENMP */
endif
#ifdef WITH_MPI
if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol
if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol
#ifdef HAVE_DETAILED_TIMINGS
call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_ROWS, mpierr)
kernel_flops = kernel_flops_recv
call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_COLS, mpierr)
kernel_flops = kernel_flops_recv
call MPI_ALLREDUCE(kernel_time, kernel_time_recv, 1, MPI_REAL8, MPI_MAX, MPI_COMM_ROWS, mpierr)
kernel_time_recv = kernel_time
call MPI_ALLREDUCE(kernel_time, kernel_time_recv, 1, MPI_REAL8, MPI_MAX, MPI_COMM_COLS, mpierr)
kernel_time_recv = kernel_time
#endif
#else /* WITH_MPI */
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6
#endif /* WITH_MPI */
! copy q to q_dev needed in trans_ev_band_to_full
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*M_size_of_PRECISSION_real)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"
stop
endif
! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"
stop
endif
! deallocate all working space
if (.not.(useGPU)) then
nullify(aIntern)
call free(aIntern_ptr)
! deallocate(aIntern, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_tridi_to_band_real: error when deallocating aIntern "//errorMessage
! stop
! endif
endif
deallocate(row, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating row "//errorMessage
stop
endif
deallocate(limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating limits"//errorMessage
stop
endif
deallocate(result_send_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating result_send_request "//errorMessage
stop
endif
deallocate(result_recv_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating result_recv_request "//errorMessage
stop
endif
deallocate(top_border_send_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating top_border_send_buffer "//errorMessage
stop
endif
deallocate(top_border_recv_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating top_border_recv_buffer "//errorMessage
stop
endif
deallocate(bottom_border_send_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_border_send_buffer "//errorMessage
stop
endif
deallocate(bottom_border_recv_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_border_recv_buffer "//errorMessage
stop
endif
deallocate(result_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating result_buffer "//errorMessage
stop
endif
deallocate(bcast_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating bcast_buffer "//errorMessage
stop
endif
deallocate(top_send_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating top_send_request "//errorMessage
stop
endif
deallocate(top_recv_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating top_recv_request "//errorMessage
stop
endif
deallocate(bottom_send_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_send_request "//errorMessage
stop
endif
deallocate(bottom_recv_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_recv_request "//errorMessage
stop
endif
if (useGPU) then
successCUDA = cuda_free(hh_dot_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage
stop
endif
successCUDA = cuda_free(hh_tau_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage
stop
endif
successCUDA = cuda_free(row_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage
stop
endif
deallocate(row_group, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating row_group "//errorMessage
stop
endif
successCUDA = cuda_free(row_group_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage
stop
endif
successCUDA = cuda_free(bcast_buffer_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage
stop
endif
endif ! useGPU
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("trans_ev_tridi_to_band_real" + M_PRECISSION_SUFFIX)
#endif /* HAVE_DETAILED_TIMINGS */
return
end subroutine M_trans_ev_tridi_to_band_real_PRECISSION
#ifndef ALREADY_DEFINED_WORKLOAD
subroutine determine_workload(na, nb, nprocs, limits)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik), intent(in) :: na, nb, nprocs
integer(kind=ik), intent(out) :: limits(0:nprocs)
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("determine_workload")
#endif
if (na <= 0) then
limits(:) = 0
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("determine_workload")
#endif
return
endif
if (nb*nprocs > na) then
! there is not enough work for all
do i = 0, nprocs
limits(i) = min(na, i*nb)
enddo
else
do i = 0, nprocs
limits(i) = (i*na)/nprocs
enddo
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("determine_workload")
#endif
end subroutine
!---------------------------------------------------------------------------------------------------
! divide_band: sets the work distribution in band
! Proc n works on blocks block_limits(n)+1 .. block_limits(n+1)
subroutine divide_band(nblocks_total, n_pes, block_limits)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik), intent(in) :: nblocks_total ! total number of blocks in band
integer(kind=ik), intent(in) :: n_pes ! number of PEs for division
integer(kind=ik), intent(out) :: block_limits(0:n_pes)
integer(kind=ik) :: n, nblocks, nblocks_left
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("divide_band")
#endif
block_limits(0) = 0
if (nblocks_total < n_pes) then
! Not enough work for all: The first tasks get exactly 1 block
do n=1,n_pes
block_limits(n) = min(nblocks_total,n)
enddo
else
! Enough work for all. If there is no exact loadbalance,
! the LAST tasks get more work since they are finishing earlier!
nblocks = nblocks_total/n_pes
nblocks_left = nblocks_total - n_pes*nblocks
do n=1,n_pes
if (n<=n_pes-nblocks_left) then
block_limits(n) = block_limits(n-1) + nblocks
else
block_limits(n) = block_limits(n-1) + nblocks + 1
endif
enddo
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("divide_band")
#endif
end subroutine
#define ALREADY_DEFINED_WORKLOAD 1
#endif /* ALREADY_DEFINED_WORKLOAD */
subroutine M_band_band_real_PRECISSION(na, nb, nbCol, nb2, nb2Col, ab, ab2, d, e, mpi_comm)
!-------------------------------------------------------------------------------
! band_band_real:
! Reduces a real symmetric banded matrix to a real symmetric matrix with smaller bandwidth. Householder transformations are not stored.
! Matrix size na and original bandwidth nb have to be a multiple of the target bandwidth nb2. (Hint: expand your matrix with
! zero entries, if this
! requirement doesn't hold)
!
! na Order of matrix
!
! nb Semi bandwidth of original matrix
!
! nb2 Semi bandwidth of target matrix
!
! ab Input matrix with bandwidth nb. The leading dimension of the banded matrix has to be 2*nb. The parallel data layout
! has to be accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb+1 to min(na, block_limits(n+1)*nb)
! are located on rank n.
!
! ab2 Output matrix with bandwidth nb2. The leading dimension of the banded matrix is 2*nb2. The parallel data layout is
! accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb2+1 to min(na, block_limits(n+1)*nb2) are located
! on rank n.
!
! d(na) Diagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
!
! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output)
!
! mpi_comm
! MPI-Communicator for the total processor set
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik), intent(in) :: na, nb, nbCol, nb2, nb2Col, mpi_comm
real(kind=REAL_DATATYPE), intent(inout) :: ab(2*nb,nbCol) ! removed assumed size
real(kind=REAL_DATATYPE), intent(inout) :: ab2(2*nb2,nb2Col) ! removed assumed size
real(kind=REAL_DATATYPE), intent(out) :: d(na), e(na) ! set only on PE 0
real(kind=REAL_DATATYPE) :: hv(nb,nb2), w(nb,nb2), w_new(nb,nb2), tau(nb2), hv_new(nb,nb2), &
tau_new(nb2), ab_s(1+nb,nb2), ab_r(1+nb,nb2), ab_s2(2*nb2,nb2), hv_s(nb,nb2)
real(kind=REAL_DATATYPE) :: work(nb*nb2), work2(nb2*nb2)
integer(kind=ik) :: lwork, info
integer(kind=ik) :: istep, i, n, dest
integer(kind=ik) :: n_off, na_s
integer(kind=ik) :: my_pe, n_pes, mpierr
integer(kind=ik) :: nblocks_total, nblocks
integer(kind=ik) :: nblocks_total2, nblocks2
integer(kind=ik) :: ireq_ab, ireq_hv
#ifdef WITH_MPI
! integer(kind=ik) :: MPI_STATUS_IGNORE(MPI_STATUS_SIZE)
#endif
! integer(kind=ik), allocatable :: mpi_statuses(:,:)
integer(kind=ik), allocatable :: block_limits(:), block_limits2(:), ireq_ab2(:)
integer(kind=ik) :: j, nc, nr, ns, ne, iblk
integer(kind=ik) :: istat
character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("band_band_real" + M_PRECISSION_SUFFIX)
#endif
! if (na .lt. 2*nb) then
! print *,"na lt 2*nb ",na,2*nb
! stop
! endif
! if (na .lt. 2*nb2) then
! print *,"na lt 2*nb2 ",na,2*nb2
! stop
! endif
! if (na .lt. nbCol) then
! print *,"na lt nbCol ",na,nbCol
! stop
! endif
! if (na .lt. nb2Col) then
! print *,"na lt nb2Col ",na,nb2Col
! stop
! endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm,my_pe,mpierr)
call mpi_comm_size(mpi_comm,n_pes,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
! Total number of blocks in the band:
nblocks_total = (na-1)/nb + 1
nblocks_total2 = (na-1)/nb2 + 1
! Set work distribution
allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error allocating block_limits "//errorMessage
stop
endif
call divide_band(nblocks_total, n_pes, block_limits)
allocate(block_limits2(0:n_pes), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error allocating block_limits2 "//errorMessage
stop
endif
call divide_band(nblocks_total2, n_pes, block_limits2)
! nblocks: the number of blocks for my task
nblocks = block_limits(my_pe+1) - block_limits(my_pe)
nblocks2 = block_limits2(my_pe+1) - block_limits2(my_pe)
allocate(ireq_ab2(1:nblocks2), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error allocating ireq_ab2 "//errorMessage
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
ireq_ab2 = MPI_REQUEST_NULL
if (nb2>1) then
do i=0,nblocks2-1
call mpi_irecv(ab2(1,i*nb2+1), 2*nb2*nb2, M_MPI_REAL_PRECISSION, 0, 3, mpi_comm, ireq_ab2(i+1), mpierr)
enddo
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! carefull the "recieve" has to be done at the corresponding send or wait
! if (nb2>1) then
! do i=0,nblocks2-1
! ab2(1:2*nb2*nb2,i*nb2+1:i*nb2+1+nb2-1) = ab_s2(1:2*nb2,i*nb2+1:nb2)
! enddo
! endif
#endif /* WITH_MPI */
! n_off: Offset of ab within band
n_off = block_limits(my_pe)*nb
lwork = nb*nb2
dest = 0
#ifdef WITH_MPI
ireq_ab = MPI_REQUEST_NULL
ireq_hv = MPI_REQUEST_NULL
#endif
! ---------------------------------------------------------------------------
! Start of calculations
na_s = block_limits(my_pe)*nb + 1
if (my_pe>0 .and. na_s<=na) then
! send first nb2 columns to previous PE
! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
do i=1,nb2
ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1)
enddo
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_isend(ab_s, (nb+1)*nb2, M_MPI_REAL_PRECISSION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
endif
do istep=1,na/nb2
if (my_pe==0) then
n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced
hv(:,:) = M_CONST_0_0
tau(:) = M_CONST_0_0
! The last step (istep=na-1) is only needed for sending the last HH vectors.
! We don't want the sign of the last element flipped (analogous to the other sweeps)
if (istep < na/nb2) then
! Transform first block column of remaining matrix
call M_PRECISSION_GEQRF(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info)
do i=1,nb2
hv(i,i) = M_CONST_1_0
hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1)
ab(1+nb2+1:2*nb,na_s-n_off+i-1) = M_CONST_0_0
enddo
endif
if (nb2==1) then
d(istep) = ab(1,na_s-n_off)
e(istep) = ab(2,na_s-n_off)
if (istep == na) then
e(na) = M_CONST_0_0
endif
else
ab_s2 = M_CONST_0_0
ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1)
if (block_limits2(dest+1)1) then
do i= 0,nblocks2-1
ab2(1:2*nb2*nb2,i*nb2+1:i+nb2+1+nb2-1) = ab_s2(1:2*nb2,1:nb2)
enddo
endif
#endif /* WITH_MPI */
endif
else
if (na>na_s+nb2-1) then
! Receive Householder vectors from previous task, from PE owning subdiagonal
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_recv(hv, nb*nb2, M_MPI_REAL_PRECISSION, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
hv(1:nb,1:nb2) = hv_s(1:nb,1:nb2)
#endif /* WITH_MPI */
do i=1,nb2
tau(i) = hv(i,i)
hv(i,i) = M_CONST_1_0
enddo
endif
endif
na_s = na_s+nb2
if (na_s-n_off > nb) then
ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
ab(:,nblocks*nb+1:(nblocks+1)*nb) = M_CONST_0_0
n_off = n_off + nb
endif
do iblk=1,nblocks
ns = na_s + (iblk-1)*nb - n_off ! first column in block
ne = ns+nb-nb2 ! last column in block
if (ns+n_off>na) exit
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
call M_wy_gen_PRECISSION(nc,nb2,w,hv,tau,work,nb)
if (iblk==nblocks .and. nc==nb) then
!request last nb2 columns
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_recv(ab_r,(nb+1)*nb2, M_MPI_REAL_PRECISSION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
ab_r(1:nb+1,1:nb2) = ab_s(1:nb+1,1:nb2)
#endif /* WITH_MPI */
do i=1,nb2
ab(1:nb+1,ne+i-1) = ab_r(:,i)
enddo
endif
hv_new(:,:) = M_CONST_0_0 ! Needed, last rows must be 0 for nr < nb
tau_new(:) = M_CONST_0_0
if (nr>0) then
call M_wy_right_PRECISSION(nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb)
call M_PRECISSION_GEQRF(nr, nb2, ab(nb+1,ns), 2*nb-1, tau_new, work, lwork, info)
do i=1,nb2
hv_new(i,i) = M_CONST_1_0
hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1)
ab(nb+2:,ns+i-1) = M_CONST_0_0
enddo
!send hh-vector
if (iblk==nblocks) then
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
hv_s = hv_new
do i=1,nb2
hv_s(i,i) = tau_new(i)
enddo
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_isend(hv_s,nb*nb2, M_MPI_REAL_PRECISSION, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
#endif /* WITH_MPI */
endif
endif
call M_wy_symm_PRECISSION(nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb)
if (my_pe>0 .and. iblk==1) then
!send first nb2 columns to previous PE
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
do i=1,nb2
ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1)
enddo
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_isend(ab_s,(nb+1)*nb2, M_MPI_REAL_PRECISSION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
#endif /* WITH_MPI */
endif
if (nr>0) then
call M_wy_gen_PRECISSION(nr,nb2,w_new,hv_new,tau_new,work,nb)
call M_wy_left_PRECISSION(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb)
endif
! Use new HH vector for the next block
hv(:,:) = hv_new(:,:)
tau = tau_new
enddo
enddo
! Finish the last outstanding requests
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
! allocate(mpi_statuses(MPI_STATUS_SIZE,nblocks2), stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"error allocating mpi_statuses "//errorMessage
! stop
! endif
call mpi_waitall(nblocks2,ireq_ab2,MPI_STATUSES_IGNORE,mpierr)
! deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"error deallocating mpi_statuses "//errorMessage
! stop
! endif
call mpi_barrier(mpi_comm,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
deallocate(block_limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error deallocating block_limits "//errorMessage
stop
endif
deallocate(block_limits2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error deallocating block_limits2 "//errorMessage
stop
endif
deallocate(ireq_ab2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error deallocating ireq_ab2 "//errorMessage
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("band_band_real" + M_PRECISSION_SUFFIX)
#endif
end subroutine
subroutine M_wy_gen_PRECISSION(n, nb, W, Y, tau, mem, lda)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik), intent(in) :: n !length of householder-vectors
integer(kind=ik), intent(in) :: nb !number of householder-vectors
integer(kind=ik), intent(in) :: lda !leading dimension of Y and W
real(kind=REAL_DATATYPE), intent(in) :: Y(lda,nb) !matrix containing nb householder-vectors of length b
real(kind=REAL_DATATYPE), intent(in) :: tau(nb) !tau values
real(kind=REAL_DATATYPE), intent(out) :: W(lda,nb) !output matrix W
real(kind=REAL_DATATYPE), intent(in) :: mem(nb) !memory for a temporary matrix of size nb
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("wy_gen" + M_PRECISSION_SUFFIX)
#endif
W(1:n,1) = tau(1)*Y(1:n,1)
do i=2,nb
W(1:n,i) = tau(i)*Y(1:n,i)
call M_PRECISSION_GEMV('T', n, i-1, M_CONST_1_0, Y, lda, W(1,i), 1, M_CONST_0_0, mem,1)
call M_PRECISSION_GEMV('N', n, i-1, -M_CONST_1_0, W, lda, mem, 1, M_CONST_1_0, W(1,i),1)
enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("wy_gen" + M_PRECISSION_SUFFIX)
#endif
end subroutine
subroutine M_wy_left_PRECISSION(n, m, nb, A, lda, W, Y, mem, lda2)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik), intent(in) :: n !width of the matrix A
integer(kind=ik), intent(in) :: m !length of matrix W and Y
integer(kind=ik), intent(in) :: nb !width of matrix W and Y
integer(kind=ik), intent(in) :: lda !leading dimension of A
integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y
real(kind=REAL_DATATYPE), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size
real(kind=REAL_DATATYPE), intent(in) :: W(m,nb) !blocked transformation matrix W
real(kind=REAL_DATATYPE), intent(in) :: Y(m,nb) !blocked transformation matrix Y
real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("wy_left" + M_PRECISSION_SUFFIX)
#endif
call M_PRECISSION_GEMM('T', 'N', nb, n, m, M_CONST_1_0, W, lda2, A, lda, M_CONST_0_0, mem, nb)
call M_PRECISSION_GEMM('N', 'N', m, n, nb, -M_CONST_1_0, Y, lda2, mem, nb, M_CONST_1_0, A, lda)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("wy_left" + M_PRECISSION_SUFFIX)
#endif
end subroutine
subroutine M_wy_right_PRECISSION(n, m, nb, A, lda, W, Y, mem, lda2)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik), intent(in) :: n !height of the matrix A
integer(kind=ik), intent(in) :: m !length of matrix W and Y
integer(kind=ik), intent(in) :: nb !width of matrix W and Y
integer(kind=ik), intent(in) :: lda !leading dimension of A
integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y
real(kind=REAL_DATATYPE), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size
real(kind=REAL_DATATYPE), intent(in) :: W(m,nb) !blocked transformation matrix W
real(kind=REAL_DATATYPE), intent(in) :: Y(m,nb) !blocked transformation matrix Y
real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("wy_right" + M_PRECISSION_SUFFIX)
#endif
call M_PRECISSION_GEMM('N', 'N', n, nb, m, M_CONST_1_0, A, lda, W, lda2, M_CONST_0_0, mem, n)
call M_PRECISSION_GEMM('N', 'T', n, m, nb, -M_CONST_1_0, mem, n, Y, lda2, M_CONST_1_0, A, lda)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("wy_right" + M_PRECISSION_SUFFIX)
#endif
end subroutine
subroutine M_wy_symm_PRECISSION(n, nb, A, lda, W, Y, mem, mem2, lda2)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik), intent(in) :: n !width/heigth of the matrix A; length of matrix W and Y
integer(kind=ik), intent(in) :: nb !width of matrix W and Y
integer(kind=ik), intent(in) :: lda !leading dimension of A
integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y
real(kind=REAL_DATATYPE), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size
real(kind=REAL_DATATYPE), intent(in) :: W(n,nb) !blocked transformation matrix W
real(kind=REAL_DATATYPE), intent(in) :: Y(n,nb) !blocked transformation matrix Y
real(kind=REAL_DATATYPE) :: mem(n,nb) !memory for a temporary matrix of size n x nb
real(kind=REAL_DATATYPE) :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("wy_symm" + M_PRECISSION_SUFFIX)
#endif
call M_PRECISSION_SYMM('L', 'L', n, nb, M_CONST_1_0, A, lda, W, lda2, M_CONST_0_0, mem, n)
call M_PRECISSION_GEMM('T', 'N', nb, nb, n, M_CONST_1_0, mem, n, W, lda2, M_CONST_0_0, mem2, nb)
call M_PRECISSION_GEMM('N', 'N', n, nb, nb, -M_CONST_0_5, Y, lda2, mem2, nb, M_CONST_1_0, mem, n)
call M_PRECISSION_SYR2K('L', 'N', n, nb, -M_CONST_1_0, Y, lda2, mem, n, M_CONST_1_0, A, lda)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("wy_symm" + M_PRECISSION_SUFFIX)
#endif
end subroutine