#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine bandred_complex_double(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, &
useGPU, success)
#else
subroutine bandred_complex_single(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, &
useGPU, success)
#endif
!-------------------------------------------------------------------------------
! bandred_complex: Reduces a distributed Hermitian matrix to band form
!
! Parameters
!
! na Order of matrix
!
! a(lda,matrixCols) Distributed matrix which should be reduced.
! Distribution is as in ScaLAPACK.
! As opposed to ScaLAPACK, a(:,:) must be set completely (upper and lower half)
! a(:,:) is overwritten on exit with the band and the Householder vectors
! in the upper half.
!
! lda Leading dimension of a
! matrixCols local columns of matrix a
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwidth of output matrix
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1
! Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
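! Note (overview): the reduction below proceeds panel by panel, starting
! with the trailing panel. For each panel of nbw columns, Householder
! vectors are computed and kept in blocked (compact WY) form,
!   H_1*H_2*...*H_k = I - V*T*V**H ,
! where the vectors V overwrite the upper half of a(:,:) and the upper
! triangular factors T are returned in tmat for the back transformation.
! The trailing matrix is then updated with a single rank-2k update.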
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
use cuda_functions
use iso_c_binding
implicit none
logical, intent(in) :: useGPU
integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: a(lda,*), tmat(nbw,nbw,*)
#else
complex(kind=COMPLEX_DATATYPE) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
complex(kind=COMPLEX_DATATYPE), parameter :: CZERO = (0.0_rk8, 0.0_rk8), CONE = (1.0_rk8, 0.0_rk8)
#else
complex(kind=COMPLEX_DATATYPE), parameter :: CZERO = (0.0_rk4, 0.0_rk4), CONE = (1.0_rk4, 0.0_rk4)
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows
integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
integer(kind=ik) :: istep, ncol, lch, lcx, nlc
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
real(kind=REAL_DATATYPE) :: vnorm2
complex(kind=COMPLEX_DATATYPE) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)
integer(kind=c_intptr_t) :: umc_dev, tmat_dev,vav_dev,vmr_dev,a_dev
integer(kind=ik) :: cur_l_rows, cur_l_cols,vmr_size ,umc_size
integer(kind=c_size_t) :: lc_start, lc_end, lr_end, lce_1, lcs_1,lre_1
integer(kind=ik) :: na_rows, na_cols
#ifdef WITH_MPI
integer(kind=ik), external :: numroc
#endif
logical, intent(in) :: wantDebug
logical, intent(out) :: success
character(200) :: errorMessage
integer(kind=ik) :: istat
logical :: successCUDA
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("bandred_complex_double")
#else
call timer%start("bandred_complex_single")
#endif
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
success = .true.
! Semi-bandwidth nbw must be a multiple of blocksize nblk
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_bandred_complex: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_complex: ELPA2 works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
if (useGPU) then
#ifdef WITH_MPI
na_rows = numroc(na, nblk, my_prow, 0, np_rows)
na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
na_rows = na
na_cols = na
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *, " bandred_complex: cuda malloc failed tmat_dev ", istat
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda malloc failed vav_dev ", istat
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda malloc failed a_dev ", istat
stop
endif
endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
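! Worked example: with np_rows=4, np_cols=2 and nblk=16 the minimum global
! tile size is 16*lcm(4,2) = 64; the scaling above gives
! ((128*4-1)/64+1)*64 = 512, hence l_rows_tile = 512/4 = 128 and
! l_cols_tile = 512/2 = 256, i.e. every local tile is at least 128 wide.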
if (useGPU) then
if (size(a,dim=1) .ne. lda .or. size(a,dim=2) .ne. na_cols) then
print *,"bandred_complex: sizes of a wrong ? ",lda,size(a,dim=1),na_cols,size(a,dim=2)
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(a_dev, loc(a(1,1)),(lda)*(na_cols)*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(a_dev, loc(a(1,1)),(lda)*(na_cols)*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy faild a_dev ", istat
stop
endif
endif
do istep = (na-1)/nbw, 1, -1
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
! Number of local columns/rows of remaining matrix
l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
if (useGPU) then
cur_l_rows = max(l_rows, 1)
cur_l_cols = max(l_cols, 1)
vmr_size = cur_l_rows * 2 * n_cols
umc_size = cur_l_cols * 2 * n_cols
if ((.not. allocated(umc)) .or. (umc_size .gt. ubound(umc, dim=1))) then
if (allocated(umc)) then
deallocate(umc, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating umc "//errorMessage
stop
endif
successCUDA = cuda_free(umc_dev)
if (.not.(successCUDA))then
print *,"bandred_complex: error in cudaFree"
stop
endif
endif
allocate(umc(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating umc "//errorMessage
stop
endif
if (max(l_cols,1) * 2*n_cols .gt. umc_size) then
print *,"bandred_complex: umc_size ",max(l_cols,1) * 2*n_cols,umc_size
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(umc_dev, umc_size*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(umc_dev, umc_size*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda malloc failed umc_dev ", istat
stop
endif
endif
if ((.not. allocated(vmr)) .or. (vmr_size .gt. ubound(vmr, dim=1))) then
if (allocated(vmr)) then
deallocate(vmr, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when deallocating vmr "//errorMessage
stop
endif
successCUDA = cuda_free(vmr_dev)
if (.not.(successCUDA))then
print *,"bandred_complex: error in cudaFree"
stop
endif
endif
allocate(vmr(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating vmr "//errorMessage
stop
endif
if (max(l_rows,1) * 2*n_cols .gt. vmr_size) then
print *,"bandred_complex: vmc_size ",max(l_rows,1) * 2*n_cols,vmr_size
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda malloc failed vmr_dev ", istat
stop
endif
endif
if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
if (allocated(vr)) then
deallocate(vr, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when deallocating vr "//errorMessage
stop
endif
endif
allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating vr "//errorMessage
stop
endif
endif
else ! GPU not used
allocate(vmr(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating vmr "//errorMessage
stop
endif
allocate(umc(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating umc "//errorMessage
stop
endif
allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating vr "//errorMessage
stop
endif
endif ! useGPU
#ifdef DOUBLE_PRECISION_COMPLEX
vmr(1:l_rows,1:n_cols) = 0._ck8
vr(:) = 0._ck8
tmat(:,:,istep) = 0._ck8
#else
vmr(1:l_rows,1:n_cols) = 0._ck4
vr(:) = 0._ck4
tmat(:,:,istep) = 0._ck4
#endif
if (useGPU) then
lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
lr_end = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)
if (lc_start .le. 0) lc_start = 1
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), int(lda*size_of_double_complex_datatype,kind=c_size_t), &
(a_dev + int( ( (lc_start-1) * lda*size_of_double_complex_datatype),kind=c_size_t )), &
int(lda*size_of_double_complex_datatype,kind=c_size_t), &
int(lr_end*size_of_double_complex_datatype,kind=c_size_t), &
int((lc_end - lc_start+1),kind=c_size_t),int(cudaMemcpyDeviceToHost,kind=c_int))
#else
successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), int(lda*size_of_single_complex_datatype,kind=c_size_t), &
(a_dev + int( ( (lc_start-1) * lda*size_of_single_complex_datatype),kind=c_size_t )), &
int(lda*size_of_single_complex_datatype,kind=c_size_t), &
int(lr_end*size_of_single_complex_datatype,kind=c_size_t), &
int((lc_end - lc_start+1),kind=c_size_t),int(cudaMemcpyDeviceToHost,kind=c_int))
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: error in cudaMemcpy2"
stop
endif
endif
endif
! Reduce current block to lower triangular form
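! Each column lc is annihilated by a Householder transformation
!   H = I - tau * v * v**H ,
! with v normalized such that its entry in the pivot row is 1; only the
! essential part of v (plus the new pivot element vrl) has to be stored
! back into a(:,:), the entry 1 is implicit.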
do lc = n_cols, 1, -1
ncol = istep*nbw + lc ! absolute column number of householder vector
nrow = ncol - nbw ! Absolute number of pivot row
lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
tau = 0
if(nrow == 1) exit ! Nothing to do
cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block
if (my_pcol==cur_pcol) then
! Get vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
vr(1:lr) = a(1:lr,lch) ! vector to be transformed
if (my_prow==prow(nrow, nblk, np_rows)) then
aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
aux1(2) = vr(lr)
else
aux1(1) = dot_product(vr(1:lr),vr(1:lr))
#ifdef DOUBLE_PRECISION_COMPLEX
aux1(2) = 0._ck8
#else
aux1(2) = 0._ck4
#endif
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(aux1, aux2, 2, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
aux2 = aux1
#endif /* WITH_MPI */
vnorm2 = aux2(1)
vrl = aux2(2)
! Householder transformation
#ifdef DOUBLE_PRECISION_COMPLEX
call hh_transform_complex_double(vrl, vnorm2, xf, tau)
#else
call hh_transform_complex_single(vrl, vnorm2, xf, tau)
#endif
! Scale vr and store Householder vector for back transformation
vr(1:lr) = vr(1:lr) * xf
if (my_prow==prow(nrow, nblk, np_rows)) then
a(1:lr-1,lch) = vr(1:lr-1)
a(lr,lch) = vrl
#ifdef DOUBLE_PRECISION_COMPLEX
vr(lr) = 1._ck8
#else
vr(lr) = 1._ck4
#endif
else
a(1:lr,lch) = vr(1:lr)
endif
endif
! Broadcast Householder vector and tau along columns
vr(lr+1) = tau
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Bcast(vr, lr+1, MPI_DOUBLE_COMPLEX, cur_pcol, mpi_comm_cols, mpierr)
#else
call MPI_Bcast(vr, lr+1, MPI_COMPLEX, cur_pcol, mpi_comm_cols, mpierr)
#endif
#endif /* WITH_MPI */
vmr(1:lr,lc) = vr(1:lr)
tau = vr(lr+1)
tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
! Transform remaining columns in current block with Householder vector
! Local dot product
#ifdef DOUBLE_PRECISION_COMPLEX
aux1 = 0._ck8
#else
aux1 = 0._ck4
#endif
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
enddo
! Get global dot products
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
endif
enddo
enddo
! Calculate scalar products of stored Householder vectors.
! This can be done in different ways, we use zherk
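! zherk/cherk below forms the local upper triangle of the Gram matrix
!   vav = vmr**H * vmr ,
! and herm_matrix_allreduce then sums the local contributions over all
! process rows, exploiting that vav is Hermitian.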
if (useGPU) then
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy2d((a_dev+int(((lc_start-1)*lda*size_of_double_complex_datatype),kind=c_size_t)), &
int(lda*size_of_double_complex_datatype,kind=c_size_t), loc(a(1,lc_start)), &
int(lda*size_of_double_complex_datatype,kind=c_size_t), &
int(lr_end*size_of_double_complex_datatype,kind=c_size_t), &
int((lc_end - lc_start+1),kind=c_size_t) &
,int(cudaMemcpyHostToDevice,kind=c_int))
#else
successCUDA = cuda_memcpy2d((a_dev+int(((lc_start-1)*lda*size_of_single_complex_datatype),kind=c_size_t)), &
int(lda*size_of_single_complex_datatype,kind=c_size_t), loc(a(1,lc_start)), &
int(lda*size_of_single_complex_datatype,kind=c_size_t), &
int(lr_end*size_of_single_complex_datatype,kind=c_size_t), &
int((lc_end - lc_start+1),kind=c_size_t) &
,int(cudaMemcpyHostToDevice,kind=c_int))
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy a_dev failed ", istat
stop
endif
endif
endif
vav = 0
if (l_rows>0) &
#ifdef DOUBLE_PRECISION_COMPLEX
call zherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1))
call herm_matrix_allreduce_double(n_cols,vav, nbw,nbw,mpi_comm_rows)
#else
call cherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1))
call herm_matrix_allreduce_single(n_cols,vav, nbw,nbw,mpi_comm_rows)
#endif
! Calculate triangular matrix T for block Householder Transformation
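! T is built row by row from the back: the diagonal entries were already
! set to conjg(tau) above; row lc follows from the finished trailing block
! T(lc+1:k,lc+1:k) and column lc of the Gram matrix vav via the triangular
! matrix-vector product below, scaled by -tau (up to conjugation).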
do lc=n_cols,1,-1
tau = tmat(lc,lc,istep)
if (lc<n_cols) then
#ifdef DOUBLE_PRECISION_COMPLEX
call ztrmv('U', 'C', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
#else
call ctrmv('U', 'C', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
#endif
tmat(lc,lc+1:n_cols,istep) = -conjg(vav(lc+1:n_cols,lc))*tau
endif
enddo
! Transpose vmr -> vmc (stored in umc, second half)
#ifdef DOUBLE_PRECISION_COMPLEX
call elpa_transpose_vectors_complex_double (vmr, ubound(vmr,dim=1), mpi_comm_rows, &
umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, &
1, istep*nbw, n_cols, nblk)
#else
call elpa_transpose_vectors_complex_single (vmr, ubound(vmr,dim=1), mpi_comm_rows, &
umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, &
1, istep*nbw, n_cols, nblk)
#endif
! Calculate umc = A**T * vmr
! Note that the distributed A has to be transposed
! Opposed to direct tridiagonalization there is no need to use the cache locality
! of the tiles, so we can use strips of the matrix
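! Bookkeeping for the two buffer halves: umc(:,1:n_cols) accumulates the
! products A**H * v for the strips at or above the diagonal, while
! vmr(:,n_cols+1:) accumulates the products with the row-transposed
! vectors (kept in umc(:,n_cols+1:)) for the strictly lower strips; the
! global sums below combine both parts.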
#ifdef DOUBLE_PRECISION_COMPLEX
umc(1:l_cols,1:n_cols) = 0.0_ck8
vmr(1:l_rows,n_cols+1:2*n_cols) = 0._ck8
#else
umc(1:l_cols,1:n_cols) = 0.0_ck4
vmr(1:l_rows,n_cols+1:2*n_cols) = 0._ck4
#endif
if (l_cols>0 .and. l_rows>0) then
if (useGPU) then
if (size(vmr,dim=1)*size(vmr,dim=2) .gt. vmr_size) then
print *,"bandred_complex: vmr size 2 :",size(vmr,dim=1)*size(vmr,dim=2),vmr_size
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(vmr_dev, loc(vmr(1,1)),vmr_size*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(vmr_dev, loc(vmr(1,1)),vmr_size*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy vmr_dev failed ", istat
stop
endif
if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
print *,"bandred_complex: umc size 2 :",size(umc,dim=1)*size(umc,dim=2),umc_size
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)),umc_size*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)),umc_size*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy umc_dev failed ", istat
stop
endif
endif
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
lre = min(l_rows, (i+1)*l_rows_tile)
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
call cublas_zgemm('C', 'N', lce-lcs+1, n_cols, lre, CONE, (a_dev + ((lcs-1)*lda* &
size_of_double_complex_datatype)), lda, vmr_dev, cur_l_rows, CONE, &
(umc_dev + (lcs-1)*size_of_double_complex_datatype), cur_l_cols)
if (i==0) cycle
lre = min(l_rows, i*l_rows_tile)
call cublas_zgemm('N', 'N', lre, n_cols, lce-lcs+1, CONE, (a_dev + ((lcs-1)*lda* &
size_of_double_complex_datatype)), lda, (umc_dev + (cur_l_cols*n_cols + lcs-1)* &
size_of_double_complex_datatype), cur_l_cols, CONE, (vmr_dev + (cur_l_rows*n_cols)* &
size_of_double_complex_datatype), cur_l_rows)
#else
call cublas_cgemm('C', 'N', lce-lcs+1, n_cols, lre, CONE, (a_dev + ((lcs-1)*lda* &
size_of_single_complex_datatype)), lda, vmr_dev, cur_l_rows, CONE, &
(umc_dev + (lcs-1)*size_of_single_complex_datatype), cur_l_cols)
if (i==0) cycle
lre = min(l_rows, i*l_rows_tile)
call cublas_cgemm('N', 'N', lre, n_cols, lce-lcs+1, CONE, (a_dev + ((lcs-1)*lda* &
size_of_single_complex_datatype)), lda, (umc_dev + (cur_l_cols*n_cols + lcs-1)* &
size_of_single_complex_datatype), cur_l_cols, CONE, (vmr_dev + (cur_l_rows*n_cols)* &
size_of_single_complex_datatype), cur_l_rows)
#endif
else ! not useGPU
#ifdef DOUBLE_PRECISION_COMPLEX
call zgemm('C', 'N', lce-lcs+1, n_cols, lre, CONE, a(1,lcs), lda, &
vmr, ubound(vmr,dim=1), CONE, umc(lcs,1), ubound(umc,dim=1))
if (i==0) cycle
lre = min(l_rows, i*l_rows_tile)
call zgemm('N', 'N', lre, n_cols, lce-lcs+1, CONE, a(1,lcs), lda, &
umc(lcs,n_cols+1), ubound(umc,dim=1), CONE, vmr(1,n_cols+1), ubound(vmr,dim=1))
#else
call cgemm('C', 'N', lce-lcs+1, n_cols, lre, CONE, a(1,lcs), lda, &
vmr, ubound(vmr,dim=1), CONE, umc(lcs,1), ubound(umc,dim=1))
if (i==0) cycle
lre = min(l_rows, i*l_rows_tile)
call cgemm('N', 'N', lre, n_cols, lce-lcs+1, CONE, a(1,lcs), lda, &
umc(lcs,n_cols+1), ubound(umc,dim=1), CONE, vmr(1,n_cols+1), ubound(vmr,dim=1))
#endif
endif
enddo
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(loc(vmr(1,1)), vmr_dev, vmr_size*size_of_double_complex_datatype, &
cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(vmr(1,1)), vmr_dev, vmr_size*size_of_single_complex_datatype, &
cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy failed vmr ", istat
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(loc(umc(1,1)), umc_dev, umc_size*size_of_double_complex_datatype, &
cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(umc(1,1)), umc_dev, umc_size*size_of_single_complex_datatype, &
cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy failed umc ", istat
stop
endif
endif
endif ! l_cols>0 .and. l_rows>0
! Sum up all ur(:) parts along rows and add them to the uc(:) parts
! on the processors containing the diagonal
! This is only necessary if ur has been calculated, i.e. if the
! global tile size is smaller than the global remaining matrix
if (tile_size < istep*nbw) then
#ifdef DOUBLE_PRECISION_COMPLEX
call elpa_reduce_add_vectors_complex_double (vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
umc, ubound(umc,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
#else
call elpa_reduce_add_vectors_complex_single (vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
umc, ubound(umc,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
#endif
endif
#ifdef WITH_MPI
if (l_cols>0) then
allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating tmp "//errorMessage
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
deallocate(tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when deallocating tmp "//errorMessage
stop
endif
endif
#else /* WITH_MPI */
if (l_cols>0) then
allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating tmp "//errorMessage
stop
endif
tmp(1:l_cols,1:n_cols) = umc(1:l_cols,1:n_cols)
umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
deallocate(tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when deallocating tmp "//errorMessage
stop
endif
endif
#endif /* WITH_MPI */
! U = U * Tmat**T
if (useGPU) then
if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
print *,"bandred_complex: umc size 4 :",size(umc,dim=1)*size(umc,dim=2),umc_size
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)),umc_size*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)),umc_size*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed umc_dev ", istat
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed tmat_dev ", istat
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call cublas_ztrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat_dev, nbw, umc_dev, cur_l_cols)
#else
call cublas_ctrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat_dev, nbw, umc_dev, cur_l_cols)
#endif
else ! not useGPU
#ifdef DOUBLE_PRECISION_COMPLEX
call ztrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), &
umc, ubound(umc,dim=1))
#else
call ctrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), &
umc, ubound(umc,dim=1))
#endif
endif
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed vav_dev ", istat
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call cublas_zgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc_dev, cur_l_cols, (umc_dev +( cur_l_cols *n_cols) &
*size_of_double_complex_datatype ), cur_l_cols, CZERO, vav_dev, nbw)
call cublas_ztrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat_dev, nbw, vav_dev, nbw)
#else
call cublas_cgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc_dev, cur_l_cols, (umc_dev +( cur_l_cols *n_cols) &
*size_of_single_complex_datatype ), cur_l_cols, CZERO, vav_dev, nbw)
call cublas_ctrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat_dev, nbw, vav_dev, nbw)
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev,nbw*nbw*size_of_double_complex_datatype,cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev,nbw*nbw*size_of_single_complex_datatype,cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed vav ", istat
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call herm_matrix_allreduce_double(n_cols,vav, nbw, nbw,mpi_comm_cols)
#else
call herm_matrix_allreduce_single(n_cols,vav, nbw, nbw,mpi_comm_cols)
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed vav_dev ", istat
stop
endif
else
#ifdef DOUBLE_PRECISION_COMPLEX
call zgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc, ubound(umc,dim=1), umc(1,n_cols+1), &
ubound(umc,dim=1), CZERO, vav, ubound(vav,dim=1))
call ztrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat(1,1,istep), &
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
#else
call cgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc, ubound(umc,dim=1), umc(1,n_cols+1), &
ubound(umc,dim=1), CZERO, vav, ubound(vav,dim=1))
call ctrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat(1,1,istep), &
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
call herm_matrix_allreduce_double(n_cols,vav,nbw,nbw,mpi_comm_cols)
#else
call herm_matrix_allreduce_single(n_cols,vav,nbw,nbw,mpi_comm_cols)
#endif
endif
! U = U - 0.5 * V * VAV
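! Why the factor -0.5: with  VAV = T*V**H*A*V*T**H  and
!   U = A*V*T**H - 0.5*V*VAV
! one has  (I - V*T*V**H) * A * (I - V*T**H*V**H) = A - V*U**H - U*V**H ,
! i.e. subtracting half of V*VAV folds the quadratic term of the two-sided
! transformation into a single rank-2k update and keeps A Hermitian.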
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
call cublas_zgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk8, 0.0_rk8), (umc_dev + &
(cur_l_cols * n_cols )*size_of_double_complex_datatype), &
cur_l_cols, vav_dev, nbw, CONE, umc_dev, cur_l_cols)
#else
call cublas_cgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk4, 0.0_rk4), (umc_dev + &
(cur_l_cols * n_cols )*size_of_single_complex_datatype), &
cur_l_cols, vav_dev, nbw, CONE, umc_dev, cur_l_cols)
#endif
! Transpose umc -> umr (stored in vmr, second half)
if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
print *,"bandred_complex: umc size 5 :",size(umc,dim=1)*size(umc,dim=2),umc_size
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(loc(umc(1,1)),umc_dev,umc_size*size_of_double_complex_datatype,cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(umc(1,1)),umc_dev,umc_size*size_of_single_complex_datatype,cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed umc ", istat
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call elpa_transpose_vectors_complex_double (umc, ubound(umc,dim=1), mpi_comm_cols, &
vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
#else
call elpa_transpose_vectors_complex_single (umc, ubound(umc,dim=1), mpi_comm_cols, &
vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
#endif
if (size(vmr,dim=1)*size(vmr,dim=2) .gt. vmr_size) then
print *,"bandred_complex: vmr size 4 :",size(vmr,dim=1)*size(vmr,dim=2),vmr_size
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(vmr_dev,loc(vmr(1,1)),vmr_size*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(vmr_dev,loc(vmr(1,1)),vmr_size*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy failed vav_dev", istat
stop
endif
if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
print *,"bandred_complex: umc size 6 :",size(umc,dim=1)*size(umc,dim=2),umc_size
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(umc_dev,loc(umc(1,1)),umc_size*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(umc_dev,loc(umc(1,1)),umc_size*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy failed umc_dev ", istat
stop
endif
else ! not useGPU
#ifdef DOUBLE_PRECISION_COMPLEX
call zgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk8, 0.0_rk8), umc(1,n_cols+1), ubound(umc,dim=1), &
vav, ubound(vav,dim=1), CONE, umc, ubound(umc,dim=1))
#else
call cgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk4, 0.0_rk4), umc(1,n_cols+1), ubound(umc,dim=1), &
vav, ubound(vav,dim=1), CONE, umc, ubound(umc,dim=1))
#endif
! Transpose umc -> umr (stored in vmr, second half)
#ifdef DOUBLE_PRECISION_COMPLEX
call elpa_transpose_vectors_complex_double (umc, ubound(umc,dim=1), mpi_comm_cols, &
vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
#else
call elpa_transpose_vectors_complex_single (umc, ubound(umc,dim=1), mpi_comm_cols, &
vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
#endif
endif
! A = A - V*U**T - U*V**T
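! The rank-2k update is applied tile by tile (using the tile layout set up
! at the top of the routine); tiles outside the remaining matrix are skipped.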
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<1) cycle
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
call cublas_zgemm('N', 'C', lre, lce-lcs+1, 2*n_cols, -CONE, &
vmr_dev, cur_l_rows, (umc_dev + (lcs-1)*size_of_double_complex_datatype), &
cur_l_cols, CONE, (a_dev + (lcs-1)*lda*size_of_double_complex_datatype), lda)
#else
call cublas_cgemm('N', 'C', lre, lce-lcs+1, 2*n_cols, -CONE, &
vmr_dev, cur_l_rows, (umc_dev + (lcs-1)*size_of_single_complex_datatype), &
cur_l_cols, CONE, (a_dev + (lcs-1)*lda*size_of_single_complex_datatype), lda)
#endif
else
#ifdef DOUBLE_PRECISION_COMPLEX
call zgemm('N', 'C', lre, lce-lcs+1, 2*n_cols, -CONE, &
vmr, ubound(vmr,dim=1), umc(lcs,1), ubound(umc,dim=1), CONE, a(1,lcs), lda)
#else
call cgemm('N', 'C', lre, lce-lcs+1, 2*n_cols, -CONE, &
vmr, ubound(vmr,dim=1), umc(lcs,1), ubound(umc,dim=1), CONE, a(1,lcs), lda)
#endif
endif
enddo

if (l_rows>0) then
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
call cublas_zgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm_dev, max_local_rows, &
q_dev, ldq, CZERO, tmp_dev, n_cols)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, n_cols*l_cols*size_of_double_complex_datatype, &
cudaMemcpyDeviceToHost)
#else
call cublas_cgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm_dev, max_local_rows, &
q_dev, ldq, CZERO, tmp_dev, n_cols)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, n_cols*l_cols*size_of_single_complex_datatype, &
cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
else
#ifdef DOUBLE_PRECISION_COMPLEX
call zgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), &
q, ldq, CZERO, tmp1, n_cols)
#else
call cgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), &
q, ldq, CZERO, tmp1, n_cols)
#endif
endif
else ! l_rows > 0
if (useGPU) then
if (l_cols*n_cols .gt. (max_local_cols)*(nbw)) then
print *,"trans_ev_band_to_full_complex: tmp_dev ",l_cols*n_cols,max_local_cols
stop
endif
! istat = cuda_memset(tmp_dev, 0, l_cols*n_cols*size_of_complex_datatype)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_complex: error in cudaMemset"
! stop
! endif
endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp1(1:l_cols*n_cols) = 0._ck8
#else
tmp1(1:l_cols*n_cols) = 0._ck4
#endif
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
#endif /* WITH_MPI */
if (l_rows>0) then
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
! tmat_temp(1:nbw,1:nbw) = tmat(1:nbw,1:nbw,istep)
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)),nbw*nbw* &
size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)),nbw*nbw* &
size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call cublas_ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm_dev, max_local_rows, &
tmp_dev, n_cols, CONE, q_dev, ldq)
#else
call cublas_ctrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm_dev, max_local_rows, &
tmp_dev, n_cols, CONE, q_dev, ldq)
#endif
else ! not useGPU
#ifdef DOUBLE_PRECISION_COMPLEX
call ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols)
call zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
tmp2, n_cols, CONE, q, ldq)
#else
call ctrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols)
call cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
tmp2, n_cols, CONE, q, ldq)
#endif
endif
endif
!#ifdef WITH_GPU_VERSION
! istat =cuda_memcpy(loc(hvm(1,1)),hvm_dev,((max_local_rows)*nbw*size_of_complex_datatype),cudaMemcpyDeviceToHost)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
! stop
! endif
!#endif
enddo
deallocate(tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_complex: error when deallocating tmp1, tmp2, hvb, hvm "//errorMessage
stop
endif
if (useGPU) then
successCUDA = cuda_free(hvm_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaFree"
stop
endif
successCUDA = cuda_free(tmp_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaFree"
stop
endif
successCUDA = cuda_free(tmat_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaFree"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(loc(q), q_dev,ldq*matrixCols*size_of_double_complex_datatype, cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(q), q_dev,ldq*matrixCols*size_of_single_complex_datatype, cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
! q(1:ldq,1:na_cols) = q_temp(1:ldq,1:na_cols)
successCUDA = cuda_free(q_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaFree"
stop
endif
! deallocate(q_temp, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_complex: error when deallocating q_temp "//errorMessage
! endif
!deallocate(tmat_temp, stat=istat, errmsg=errorMessage)
!if (istat .ne. 0) then
!print *,"trans_ev_band_to_full_complex: error when deallocating tmat_temp "//errorMessage
!endif
endif ! use GPU
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("trans_ev_band_to_full_complex_double")
#else
call timer%stop("trans_ev_band_to_full_complex_single")
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
end subroutine trans_ev_band_to_full_complex_double
#else
end subroutine trans_ev_band_to_full_complex_single
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine tridiag_band_complex_double(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_complex, &
mpi_comm_rows, mpi_comm_cols, mpi_comm)
#else
subroutine tridiag_band_complex_single(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_complex, &
mpi_comm_rows, mpi_comm_cols, mpi_comm)
#endif
!-------------------------------------------------------------------------------
! tridiag_band_complex:
! Reduces a complex Hermitian band matrix to tridiagonal form
!
! na Order of matrix a
!
! nb Semi bandwidth
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal
!
! lda Leading dimension of a
! matrixCols local columns of matrix a
!
! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output)
!
! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
! mpi_comm
! MPI-Communicator for the total processor set
!-------------------------------------------------------------------------------
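! Algorithm outline: the band is distributed in contiguous blocks of
! columns over all tasks. Annihilating a column of the band creates
! fill-in one block further down, which is annihilated in turn ("bulge
! chasing"); the Householder vectors travel from task to task so that all
! tasks work simultaneously on successive sweeps, and every vector is also
! collected in hh_trans_complex for the later back transformation.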
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
!#ifdef WITH_GPU_VERSION
! integer(C_SIZE_T) :: h_dev, hv_new_dev ,ab_dev,x_dev,hs_dev,tau_new_dev,hv_dev,hd_dev
! complex*16, allocatable :: ab_temp(:,:)
!#endif
integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE),intent(in) :: a(lda,*)
#else
complex(kind=COMPLEX_DATATYPE), intent(in) :: a(lda,matrixCols)
#endif
real(kind=REAL_DATATYPE), intent(out) :: d(na), e(na) ! set only on PE 0
complex(kind=COMPLEX_DATATYPE), intent(inout), &
allocatable :: hh_trans_complex(:,:)
real(kind=REAL_DATATYPE) :: vnorm2
complex(kind=COMPLEX_DATATYPE) :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
complex(kind=COMPLEX_DATATYPE) :: hd(nb), hs(nb)
integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
integer(kind=ik) :: my_pe, n_pes, mpierr
integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols
integer(kind=ik) :: ireq_ab, ireq_hv
integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
#ifdef WITH_OPENMP
integer(kind=ik), allocatable :: mpi_statuses(:,:)
integer(kind=ik), allocatable :: omp_block_limits(:)
integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter
integer(kind=ik) :: omp_get_max_threads
#ifdef WITH_MPI
integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE)
#endif
complex(kind=COMPLEX_DATATYPE), allocatable :: hv_t(:,:), tau_t(:)
#endif
integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
integer(kind=ik), allocatable :: limits(:), snd_limits(:,:)
integer(kind=ik), allocatable :: block_limits(:)
complex(kind=COMPLEX_DATATYPE), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
integer(kind=ik) :: istat
character(200) :: errorMessage
#ifndef WITH_MPI
integer(kind=ik) :: startAddr
#endif
! ! dummies for calling redist_band
! real*8 :: r_a(1,1), r_ab(1,1)
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("tridiag_band_complex_double")
#else
call timer%start("tridiag_band_complex_single")
#endif
#endif
call mpi_comm_rank(mpi_comm,my_pe,mpierr)
call mpi_comm_size(mpi_comm,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
!#ifdef WITH_GPU_VERSION
! t_1 = 0
! t_2 = 0
!#endif
! Get global_id mapping 2D processor coordinates to global id
allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating global_id "//errorMessage
stop
endif
global_id(:,:) = 0
global_id(my_prow, my_pcol) = my_pe
#ifdef WITH_MPI
call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr)
#endif
! Total number of blocks in the band:
nblocks_total = (na-1)/nb + 1
! Set work distribution
allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating block_limits "//errorMessage
stop
endif
call divide_band(nblocks_total, n_pes, block_limits)
! nblocks: the number of blocks for my task
nblocks = block_limits(my_pe+1) - block_limits(my_pe)
! allocate the part of the band matrix which is needed by this PE
! The size is 1 block larger than needed to avoid extensive shifts
allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating ab "//errorMessage
stop
endif
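! Storage scheme for ab: column j of ab holds band column j+n_off of the
! global matrix, with ab(1,j) the diagonal entry and ab(2:nb+1,j) the
! entries below it (lower band storage); the leading dimension 2*nb leaves
! room for the fill-in created during the bulge chase.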
!#ifdef WITH_GPU_VERSION
! allocate(ab_temp(2*nb,nblocks*nb), stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"error when allocating ab_temp "//errorMessage
! stop
! endif
!#endif
ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety
!#ifdef WITH_GPU_VERSION
!
! istat = cuda_malloc(ab_dev, 2*nb*(nblocks+1)*nb*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed ab_dev", istat
!
! istat = cuda_malloc(hv_new_dev, nb*size_of_complex_datatype )
! if (istat .ne. 0) print *, " cuda malloc failed hv_new_dev", istat
!
!! istat = cuda_malloc(temp_c_dev, nb*nb*size_of_complex_datatype )
!! if(istat .ne. 0) print *, " cuda malloc failed temp_c", istat
!
! istat = cuda_malloc(h_dev , nb*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed h_dev", istat
!
! istat = cuda_malloc(hs_dev , nb*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed hs_dev", istat
!
! istat = cuda_malloc(x_dev , 1*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed x_dev", istat
!
! istat = cuda_malloc( tau_new_dev , 1*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed tau_new_dev", istat
!
! istat = cuda_malloc(hv_dev , nb*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed hv_dev", istat
!
! istat = cuda_malloc(hd_dev , nb*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed hd_dev", istat
!#endif
! n_off: Offset of ab within band
n_off = block_limits(my_pe)*nb
! Redistribute band in a to ab
#ifdef DOUBLE_PRECISION_COMPLEX
call redist_band_complex_double(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab)
#else
call redist_band_complex_single(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab)
#endif
! Calculate the workload for each sweep in the back transformation
! and the space requirements to hold the HH vectors
allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating limits "//errorMessage
stop
endif
call determine_workload(na, nb, np_rows, limits)
max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
num_hh_vecs = 0
num_chunks = 0
nx = na
do n = 1, nblocks_total
call determine_workload(nx, nb, np_rows, limits)
local_size = limits(my_prow+1) - limits(my_prow)
! add to number of householder vectors
! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor sent below!
if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
num_hh_vecs = num_hh_vecs + local_size
num_chunks = num_chunks+1
endif
nx = nx - nb
enddo
! Allocate space for HH vectors
allocate(hh_trans_complex(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating hh_trans_comples "//errorMessage
stop
endif
! Allocate and init MPI requests
allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating ireq_hhr "//errorMessage
stop
endif
allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage) ! Send requests
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating ireq_hhs "//errorMessage
stop
endif
num_hh_vecs = 0
num_chunks = 0
nx = na
nt = 0
do n = 1, nblocks_total
call determine_workload(nx, nb, np_rows, limits)
local_size = limits(my_prow+1) - limits(my_prow)
if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
num_chunks = num_chunks+1
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_irecv(hh_trans_complex(1,num_hh_vecs+1), nb*local_size, MPI_COMPLEX16, nt, &
10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr)
#else
call mpi_irecv(hh_trans_complex(1,num_hh_vecs+1), nb*local_size, MPI_COMPLEX8, nt, &
10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr)
#endif
#else /* WITH_MPI */
! careful: for the non-blocking recv the data copy must be done at the wait or send
! hh_trans_complex(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)
#endif /* WITH_MPI */
num_hh_vecs = num_hh_vecs + local_size
endif
nx = nx - nb
if (n == block_limits(nt+1)) then
nt = nt + 1
endif
enddo
#ifdef WITH_MPI
ireq_hhs(:) = MPI_REQUEST_NULL
#endif
! Buffers for gathering/sending the HH vectors
allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating hh_gath "//errorMessage
stop
endif
allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating hh_sebd "//errorMessage
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
hh_gath(:,:,:) = 0._ck8
hh_send(:,:,:) = 0._ck8
#else
hh_gath(:,:,:) = 0._ck4
hh_send(:,:,:) = 0._ck4
#endif
! Some counters
allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating hh_cnt "//errorMessage
stop
endif
allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating hh_dst "//errorMessage
stop
endif
hh_cnt(:) = 1 ! The first transformation vector is always 0 and not calculated at all
hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI
ireq_ab = MPI_REQUEST_NULL
ireq_hv = MPI_REQUEST_NULL
#endif
! Limits for sending
allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating snd_limits "//errorMessage
stop
endif
do iblk=1,nblocks
call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
enddo
#ifdef WITH_OPENMP
! OpenMP work distribution:
max_threads = 1
!$ max_threads = omp_get_max_threads()
! For OpenMP we need at least 2 blocks for every thread
max_threads = MIN(max_threads, nblocks/2)
if (max_threads==0) max_threads = 1
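! Example: nblocks=8 and 16 OpenMP threads give max_threads = min(16,4) = 4,
! so at most 4 threads are used and every thread owns at least 2 blocks
! (one to work on, one acting as buffer towards its predecessor in the
! pipeline described below).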
allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating omp_block_limits "//errorMessage
stop
endif
! Get the OpenMP block limits
call divide_band(nblocks, max_threads, omp_block_limits)
allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating hv_t, tau_t "//errorMessage
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
hv_t = 0._ck8
tau_t = 0._ck8
#else
hv_t = 0._ck4
tau_t = 0._ck4
#endif
#endif
! ---------------------------------------------------------------------------
! Start of calculations
na_s = block_limits(my_pe)*nb + 1
if (my_pe>0 .and. na_s<=na) then
! send first column to previous PE
! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_isend(ab_s, nb+1, MPI_COMPLEX16, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#else
call mpi_isend(ab_s, nb+1, MPI_COMPLEX8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#endif
#endif /* WITH_MPI */
endif
#ifndef WITH_MPI
startAddr = ubound(hh_trans_complex,dim=2)
#endif
#ifdef WITH_OPENMP
do istep=1,na-1-block_limits(my_pe)*nb
#else
do istep=1,na-1
#endif
if (my_pe==0) then
n = MIN(na-na_s,nb) ! number of rows to be reduced
#ifdef DOUBLE_PRECISION_COMPLEX
hv(:) = 0._ck8
tau = 0._ck8
#else
hv(:) = 0._ck4
tau = 0._ck4
#endif
! Transform first column of remaining matrix
! As opposed to the real case, the last step (istep=na-1) is needed here for making
! the last subdiagonal element a real number
#ifdef DOUBLE_PRECISION_COMPLEX
vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk8)**2+dimag(ab(3:n+1,na_s-n_off))**2)
#else
vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk4)**2+aimag(ab(3:n+1,na_s-n_off))**2)
#endif
if (n<2) vnorm2 = 0. ! Safety only
#ifdef DOUBLE_PRECISION_COMPLEX
call hh_transform_complex_double(ab(2,na_s-n_off),vnorm2,hf,tau)
#else
call hh_transform_complex_single(ab(2,na_s-n_off),vnorm2,hf,tau)
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
hv(1) = 1._ck8
#else
hv(1) = 1._ck4
#endif
hv(2:n) = ab(3:n+1,na_s-n_off)*hf
d(istep) = ab(1,na_s-n_off)
e(istep) = ab(2,na_s-n_off)
if (istep == na-1) then
d(na) = ab(1,na_s+1-n_off)
#ifdef DOUBLE_PRECISION_COMPLEX
e(na) = 0._rk8
#else
e(na) = 0._rk4
#endif
endif
else
if (na>na_s) then
! Receive Householder vector from previous task, from PE owning subdiagonal
#ifdef WITH_OPENMP
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_recv(hv, nb, MPI_COMPLEX16, my_pe-1, 2, mpi_comm, mpi_status, mpierr)
#else
call mpi_recv(hv, nb, MPI_COMPLEX8, my_pe-1, 2, mpi_comm, mpi_status, mpierr)
#endif
#else /* WITH_MPI */
hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_recv(hv, nb, MPI_COMPLEX16, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#else
call mpi_recv(hv, nb, MPI_COMPLEX8, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#endif
#else /* WITH_MPI */
hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
tau = hv(1)
#ifdef DOUBLE_PRECISION_COMPLEX
hv(1) = 1._ck8
#else
hv(1) = 1._ck4
#endif
endif
endif
na_s = na_s+1
if (na_s-n_off > nb) then
!#ifdef WITH_GPU_VERSION
! ab_temp(:,1:nblocks*nb) = ab(:,nb+1:(nblocks +1)*nb)
! ab(:, 1:nblocks*nb) = ab_temp(:, 1:nblocks*nb)
!#else
ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
!#endif
ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0
n_off = n_off + nb
endif
#ifdef WITH_OPENMP
if (max_threads > 1) then
! Codepath for OpenMP
! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
! This simulates the behaviour of the MPI tasks which also work after each other.
! The code would be considerably easier, if the MPI communication would be made within
! the parallel region - this is avoided here since this would require
! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.
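! Example with 3 threads: in a given step, thread 1 works on the newest
! bulge, thread 2 on the bulge created one step earlier, thread 3 on the
! one created two steps earlier; the two iterations below (first block
! only, then the remaining blocks) provide the barrier which guarantees
! that a thread's first block is complete before its successor uses the
! shared boundary row.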
hv_t(:,1) = hv
tau_t(1) = tau
do iter = 1, 2
! iter=1 : work on first block
! iter=2 : work on remaining blocks
! This is done in 2 iterations so that we have a barrier in between:
! After the first iteration, it is guaranteed that the last row of the last block
! is completed by the next thread.
! After the first iteration it is also the place to exchange the last row
! with MPI calls
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("OpenMP parallel_double")
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
do my_thread = 1, max_threads
if (iter == 1) then
my_block_s = omp_block_limits(my_thread-1) + 1
my_block_e = my_block_s
else
my_block_s = omp_block_limits(my_thread-1) + 2
my_block_e = omp_block_limits(my_thread)
endif
do iblk = my_block_s, my_block_e
ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
ne = ns+nb-1 ! last column in block
if (istep<my_thread .or. ns+n_off>na) exit
hv = hv_t(:,my_thread)
tau = tau_t(my_thread)
! Store Householder vector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1
hh_gath(1 ,hh_cnt(iblk),iblk) = tau
hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
! Transform diagonal block
#ifdef DOUBLE_PRECISION_COMPLEX
call ZHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, (0.0_rk8, 0.0_rk8), hd, 1)
#else
call CHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, (0.0_rk4, 0.0_rk4), hd, 1)
#endif
x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc)
#ifdef DOUBLE_PRECISION_COMPLEX
call ZHER2('L', nc, (-1.0_rk8, 0.0_rk8), hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#else
call CHER2('L', nc, (-1.0_rk4, 0.0_rk4), hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
hv_t(:,my_thread) = 0._ck8
tau_t(my_thread) = 0._ck8
#else
hv_t(:,my_thread) = 0._ck4
tau_t(my_thread) = 0._ck4
#endif
if (nr<=0) cycle ! No subdiagonal block present any more
! Transform subdiagonal block
#ifdef DOUBLE_PRECISION_COMPLEX
call ZGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, (0.0_rk8,0.0_rk8), hs, 1)
#else
call CGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, (0.0_rk4,0.0_rk4), hs, 1)
#endif
if (nr>1) then
! complete (old) Householder transformation for first column
ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
! calculate new Householder transformation for first column
! (stored in hv_t(:,my_thread) and tau_t(my_thread))
#ifdef DOUBLE_PRECISION_COMPLEX
vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
#else
vnorm2 = sum(real(ab(nb+2:nb+nr,ns))**2+aimag(ab(nb+2:nb+nr,ns))**2)
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
call hh_transform_complex_double(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread))
#else
call hh_transform_complex_single(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread))
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
hv_t(1 ,my_thread) = 1._ck8
hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
ab(nb+2:,ns) = 0._ck8
#else
hv_t(1 ,my_thread) = 1._ck4
hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
ab(nb+2:,ns) = 0._ck4
#endif
! update subdiagonal block for old and new Householder transformation
! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
#ifdef DOUBLE_PRECISION_COMPLEX
call ZGEMV('C', nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, (0.0_rk8,0.0_rk8), h(2), 1)
#else
call CGEMV('C', nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, (0.0_rk4,0.0_rk4), h(2), 1)
#endif
x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
h(2:nb) = h(2:nb) - x*hv(2:nb)
! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
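! The explicit loop below performs the nonsymmetric rank-2 update
!   ab_sub <- ab_sub - hv_new*h**H - hs*hv**H
! in band storage, combining the tail of the old transformation (hs, hv)
! with the head of the new one (hv_new in hv_t, h) in a single pass.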
do i=2,nb
ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) &
- hv_t(1:nr,my_thread)*conjg(h(i)) - hs(1:nr)*conjg(hv(i))
enddo
else
! No new Householder transformation for nr=1, just complete the old one
ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
do i=2,nb
ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
enddo
! For safety: there is one remaining dummy transformation (but tau is 0 anyway)
#ifdef DOUBLE_PRECISION_COMPLEX
hv_t(1,my_thread) = 1._ck8
#else
hv_t(1,my_thread) = 1._ck4
#endif
endif
enddo
enddo ! my_thread
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("OpenMP parallel_double")
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
if (iter==1) then
! We are at the end of the first block
! Send our first column to previous PE
if (my_pe>0 .and. na_s <= na) then
#ifdef WITH_MPI
call mpi_wait(ireq_ab,mpi_status,mpierr)
#endif
ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_isend(ab_s, nb+1, MPI_COMPLEX16, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#else
call mpi_isend(ab_s, nb+1, MPI_COMPLEX8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#endif
#endif /* WITH_MPI */
endif
! Request last column from next PE
ne = na_s + nblocks*nb - (max_threads-1) - 1
if (istep>=max_threads .and. ne <= na) then
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_recv(ab(1,ne-n_off), nb+1, MPI_COMPLEX16, my_pe+1, 1, mpi_comm, mpi_status, mpierr)
#else
call mpi_recv(ab(1,ne-n_off), nb+1, MPI_COMPLEX8, my_pe+1, 1, mpi_comm, mpi_status, mpierr)
#endif
#else /* WITH_MPI */
ab(1:nb+1,ne-n_off) = ab_s(1:nb+1)
#endif /* WITH_MPI */
endif
else
! We are at the end of all blocks
! Send last HH vector and TAU to next PE if it has been calculated above
ne = na_s + nblocks*nb - (max_threads-1) - 1
if (istep>=max_threads .and. ne < na) then
#ifdef WITH_MPI
call mpi_wait(ireq_hv,mpi_status,mpierr)
#endif
hv_s(1) = tau_t(max_threads)
hv_s(2:) = hv_t(2:,max_threads)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_isend(hv_s, nb, MPI_COMPLEX16, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
#else
call mpi_isend(hv_s, nb, MPI_COMPLEX8, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
#endif
#endif /* WITH_MPI */
endif
! "Send" HH vector and TAU to next OpenMP thread
do my_thread = max_threads, 2, -1
hv_t(:,my_thread) = hv_t(:,my_thread-1)
tau_t(my_thread) = tau_t(my_thread-1)
enddo
endif
enddo ! iter
else
! Codepath for 1 thread without OpenMP
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early
! as possible.
#endif /* WITH_OPENMP */
!#ifdef WITH_GPU_VERSION
! call cpu_time(start)
!#endif
do iblk=1,nblocks
ns = na_s + (iblk-1)*nb - n_off ! first column in block
ne = ns+nb-1 ! last column in block
if (ns+n_off>na) exit
! Store Householder vector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1
hh_gath(1 ,hh_cnt(iblk),iblk) = tau
hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
#ifndef WITH_OPENMP
if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
! Wait for last transfer to finish
#ifdef WITH_MPI
call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
#endif
! Copy vectors into send buffer
hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
! Send to destination
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX16, &
global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), &
10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
#else
call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX8, &
global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), &
10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
#endif
#else /* WITH_MPI */
startAddr = startAddr - hh_cnt(iblk)
hh_trans_complex(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
#endif /* WITH_MPI */
! Reset counter and increase destination row
hh_cnt(iblk) = 0
hh_dst(iblk) = hh_dst(iblk)+1
endif
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early
! as possible.
#endif /* OpenMP */
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
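! (e.g. in the last block on the last PE nr can become <= 0, in which case the
! subdiagonal GEMV below is simply skipped)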
! Multiply diagonal block and subdiagonal block with Householder vector
if (iblk==nblocks .and. nc==nb) then
! We need the last column from the next PE.
! First do the matrix multiplications without last column ...
! Diagonal block, the contribution of the last element is added below
#ifdef DOUBLE_PRECISION_COMPLEX
ab(1,ne) = 0._ck8
call ZHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1,(0.0_rk8,0.0_rk8),hd,1)
! Subdiagonal block
if (nr>0) call ZGEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1,(0.0_rk8,0.0_rk8),hs,1)
! ... then request last column ...
#ifdef WITH_MPI
#ifdef WITH_OPENMP
call mpi_recv(ab(1,ne), nb+1, MPI_COMPLEX16, my_pe+1, 1, mpi_comm, mpi_status, mpierr)
#else
call mpi_recv(ab(1,ne), nb+1, MPI_COMPLEX16, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#endif
#else /* WITH_MPI */
ab(1:nb+1,ne) = ab_s(1:nb+1)
#endif /* WITH_MPI */
! ... and complete the result
hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau
else
! Normal matrix multiply
call ZHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, (0.0_rk8,0.0_rk8), hd, 1)
if (nr>0) call ZGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, (0.0_rk8,0.0_rk8), hs, 1)
endif
#else /* DOUBLE_PRECISION_COMPLEX */
ab(1,ne) = 0._ck4
call CHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1,(0.0_rk4,0.0_rk4),hd,1)
! Subdiagonal block
if (nr>0) call CGEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1,(0.0_rk4,0.0_rk4),hs,1)
! ... then request last column ...
#ifdef WITH_MPI
#ifdef WITH_OPENMP
call mpi_recv(ab(1,ne), nb+1, MPI_COMPLEX8, my_pe+1, 1, mpi_comm, mpi_status, mpierr)
#else
call mpi_recv(ab(1,ne), nb+1, MPI_COMPLEX8, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#endif
#else /* WITH_MPI */
ab(1:nb+1,ne) = ab_s(1:nb+1)
#endif /* WITH_MPI */
! ... and complete the result
hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau
else
! Normal matrix multiply
call CHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, (0.0_rk4,0.0_rk4), hd, 1)
if (nr>0) call CGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, (0.0_rk4,0.0_rk4), hs, 1)
endif
#endif /* DOUBLE_PRECISION_COMPLEX */
! Calculate first column of subdiagonal block and calculate new
! Householder transformation for this column
hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb
!#ifdef WITH_GPU_VERSION
! istat = cuda_memset(hv_new_dev, 0,nb*size_of_complex_datatype )
! if (istat .ne. 0) print *, " cuda memset failed hv_new_dev", istat
!#endif
tau_new = 0
if (nr>0) then
! complete (old) Householder transformation for first column
ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
! calculate new Householder transformation ...
if (nr>1) then
#ifdef DOUBLE_PRECISION_COMPLEX
vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk8)**2+dimag(ab(nb+2:nb+nr,ns))**2)
#else
vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk4)**2+aimag(ab(nb+2:nb+nr,ns))**2)
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
call hh_transform_complex_double(ab(nb+1,ns),vnorm2,hf,tau_new)
#else
call hh_transform_complex_single(ab(nb+1,ns),vnorm2,hf,tau_new)
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
hv_new(1) = 1._ck8
hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf
ab(nb+2:,ns) = 0._ck8
#else
hv_new(1) = 1._ck4
hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf
ab(nb+2:,ns) = 0._ck4
#endif
endif
! ... and send it away immediately if this is the last block
if (iblk==nblocks) then
#ifdef WITH_MPI
#ifdef WITH_OPENMP
call mpi_wait(ireq_hv,mpi_status,mpierr)
#else
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
#endif
#endif /* WITH_MPI */
hv_s(1) = tau_new
hv_s(2:) = hv_new(2:)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_isend(hv_s, nb, MPI_COMPLEX16, my_pe+1, 2 ,mpi_comm, ireq_hv, mpierr)
#else
call mpi_isend(hv_s, nb, MPI_COMPLEX8, my_pe+1, 2 ,mpi_comm, ireq_hv, mpierr)
#endif
#endif /* WITH_MPI */
endif
endif
! Transform diagonal block
x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
#ifdef DOUBLE_PRECISION_COMPLEX
hd(1:nc) = hd(1:nc) - 0.5_rk8*x*hv(1:nc)
#else
hd(1:nc) = hd(1:nc) - 0.5_rk4*x*hv(1:nc)
#endif
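! Standard Householder similarity trick: with x = conjg(tau) * hv^H * hd as computed
! above, the two-sided update A <- (I - tau*hv*hv^H) A (I - tau*hv*hv^H)^H collapses
! to the Hermitian rank-2 update A <- A - hd*hv^H - hv*hd^H performed below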
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy2d((ab_dev + (ns-1)*2*nb*size_of_complex_datatype), 2*nb*size_of_complex_datatype,loc(a(1,ns)), 2*nb*size_of_complex_datatype, 2*size_of_complex_datatype , &
! 2*nb*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *, "cuda memcpy a_dev H2D failed ", istat
! istat =cuda_memcpy(hv_dev,loc(hv),nc*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *,"cuda memcpy failed hv_dev", istat
! istat =cuda_memcpy(hd_dev,loc(hd), nb*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *,"cuda memcpy failed hd_dev", istat
!#endif
if (my_pe>0 .and. iblk==1) then
! The first column of the diagonal block has to be sent to the previous PE
! Calculate first column only ...
!#ifdef WITH_GPU_VERSION
! call double_hh_transform_2( ns, nc, nb )
! istat=cuda_memcpy(loc(ab),ab_dev,(2*nb*(nblocks+1)*nb)*size_of_complex_datatype,cudaMemcpyDeviceToHost)
! if (istat .ne. 0) print *, " cuda memcpy failed ab ", istat
!#else
ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*conjg(hv(1)) - hv(1:nc)*conjg(hd(1))
!#endif
! ... send it away ...
#ifdef WITH_MPI
#ifdef WITH_OPENMP
call mpi_wait(ireq_ab,mpi_status,mpierr)
#else
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
#endif
#endif /* WITH_MPI */
ab_s(1:nb+1) = ab(1:nb+1,ns)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_isend(ab_s, nb+1, MPI_COMPLEX16, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#else
call mpi_isend(ab_s, nb+1, MPI_COMPLEX8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#endif
#endif /* WITH_MPI */
! ... and calculate remaining columns with rank-2 update
if (nc>1) then
!#ifdef WITH_GPU_VERSION
! call cublas_ZHER2( 'L',nc -1,(-1.d0,0.d0), hd_dev + 1*16, 1, hv_dev +1*16, 1 , ab_dev + (ns*2*nb )*16, 2*nb-1)
!#else
#ifdef DOUBLE_PRECISION_COMPLEX
call ZHER2('L', nc-1, (-1.0_rk8,0.0_rk8), hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1)
#else
call CHER2('L', nc-1, (-1.0_rk4,0.0_rk4), hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1)
#endif
!#endif
endif
else
! No need to send, just a rank-2 update
!#ifdef WITH_GPU_VERSION
! call cublas_ZHER2( 'L',nc ,(-1.d0,0.d0), hd_dev, 1, hv_dev, 1 , ab_dev + ((ns-1)*2*nb )*16, 2*nb-1)
!#else
#ifdef DOUBLE_PRECISION_COMPLEX
call ZHER2('L', nc, (-1.0_rk8,0.0_rk8), hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#else
call CHER2('L', nc, (-1.0_rk4,0.0_rk4), hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#endif
!#endif
endif
!#ifdef WITH_GPU_VERSION
! istat=cuda_memcpy( loc(hd),hd_dev,nb*size_of_complex_datatype,cudaMemcpyDeviceToHost)
! if (istat .ne. 0) print *,"cuda memcpy failed hd_dev", istat
!#endif
! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb
!#ifdef WITH_GPU_VERSION
! istat =cuda_memcpy(hs_dev,loc(hs),nb*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *,"cuda memcpy failed hs_dev", istat
!#endif
if (nr>0) then
if (nr>1) then
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(hv_new_dev,loc(hv_new),nb*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *,"cuda memcpy failed hv_new_dev", istat
!
! istat = cuda_memcpy(h_dev,loc(h),nb*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *,"cuda memcpy failed h_dev", istat
!
! call cublas_ZGEMV('C',nr,nb-1,tau_new,ab_dev + (nb-1 + ns *2*nb)*16,2*nb-1,hv_new_dev,1,(0.d0,0.d0),h_dev + 1* 16,1)
!
! istat = cuda_memcpy(tau_new_dev,loc(tau_new),1*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *,"cuda memcpy failed tau_new_dev", istat
!
! call dot_product_kernel(nr , tau_new)
! call dot_product_kernel_1( nb, nr , ns)
!
! istat = cuda_memcpy(loc(x),x_dev,1*size_of_complex_datatype,cudaMemcpyDeviceToHost)
! if (istat .ne. 0) print *, " cuda memcpy failed x_dev ", istat
!
! istat =cuda_memcpy(loc(h),h_dev,nb*size_of_complex_datatype,cudaMemcpyDeviceToHost)
! if (istat .ne. 0) print *, " cuda memcpy failed h ", istat
!#else
#ifdef DOUBLE_PRECISION_COMPLEX
call ZGEMV('C', nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, hv_new, 1, (0.0_rk8, 0.0_rk8), h(2), 1)
#else
call CGEMV('C', nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, hv_new, 1, (0.0_rk4, 0.0_rk4), h(2), 1)
#endif
x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new
h(2:nb) = h(2:nb) - x*hv(2:nb)
! Unfortunately there is no BLAS routine like ZHER2 for a nonsymmetric rank-2 update
do i=2,nb
ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i))
enddo
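! i.e. two rank-1 updates applied column by column to the packed band storage:
! for i=2..nb: ab(:,i+ns-1) <- ab(:,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i))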
!#endif
else
! No double Householder transformation for nr=1, just complete the row
!#ifdef WITH_GPU_VERSION
! call double_hh_transform_1(nb, ns)
!#else
do i=2,nb
ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
enddo
!#endif
endif
endif
! Use new HH vector for the next block
hv(:) = hv_new(:)
tau = tau_new
enddo
!#ifdef WITH_GPU_VERSION
! call cpu_time(finish)
! tstep2 = finish-start
! t_2 = t_2 + tstep2
!#endif
#ifdef WITH_OPENMP
endif
#endif
#ifdef WITH_OPENMP
do iblk = 1, nblocks
if (hh_dst(iblk) >= np_rows) exit
if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit
if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
! Wait for last transfer to finish
#ifdef WITH_MPI
call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr)
#endif
! Copy vectors into send buffer
hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
! Send to destination
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_complex16, &
global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), &
10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
#else
call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_complex8, &
global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), &
10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
#endif
#else /* WITH_MPI */
startAddr = startAddr - hh_cnt(iblk)
hh_trans_complex(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
#endif /* WITH_MPI */
! Reset counter and increase destination row
hh_cnt(iblk) = 0
hh_dst(iblk) = hh_dst(iblk)+1
endif
enddo
#endif /* WITH_OPENMP */
enddo
!#ifdef WITH_GPU_VERSION
! call cpu_time(finish_1)
! tstep1 = finish_1-start_1
! t_1 = t_1 + tstep1
!#endif
! Finish the last outstanding requests
#ifdef WITH_OPENMP
#ifdef WITH_MPI
call mpi_wait(ireq_ab,mpi_status,mpierr)
call mpi_wait(ireq_hv,mpi_status,mpierr)
allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks)), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating mpi_statuses "//errorMessage
stop
endif
call mpi_waitall(nblocks, ireq_hhs, mpi_statuses, mpierr)
call mpi_waitall(num_chunks, ireq_hhr, mpi_statuses, mpierr)
deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating mpi_statuses "//errorMessage
stop
endif
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
#ifdef WITH_MPI
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)
#endif
#endif /* WITH_OPENMP */
#ifdef WITH_MPI
call mpi_barrier(mpi_comm,mpierr)
#endif
deallocate(ab, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating ab "//errorMessage
stop
endif
!#ifdef WITH_GPU_VERSION
! deallocate(ab_temp, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"error when deallocating ab_temp "//errorMessage
! stop
! endif
!
!#endif
deallocate(ireq_hhr, ireq_hhs, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating ireq_hhr, ireq_hhs "//errorMessage
stop
endif
deallocate(hh_cnt, hh_dst, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating hh_cnt, hh_dst "//errorMessage
stop
endif
deallocate(hh_gath, hh_send, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating hh_gath, hh_send, "//errorMessage
stop
endif
deallocate(limits, snd_limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating limits, snd_limits "//errorMessage
stop
endif
deallocate(block_limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating block_limits, "//errorMessage
stop
endif
deallocate(global_id, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating global_id, "//errorMessage
stop
endif
!#ifdef WITH_GPU_VERSION
! istat = cuda_free(ab_dev)
! if (istat .ne. 0) then
! print *,"error in cudaFree"
! stop
! endif
!
! istat = cuda_free(hv_new_dev)
! if (istat .ne. 0) then
! print *,"error in cudaFree"
! stop
! endif
!
! istat = cuda_free(hs_dev)
! if (istat .ne. 0) then
! print *,"error in cudaFree"
! stop
! endif
!
! istat = cuda_free(h_dev)
! if (istat .ne. 0) then
! print *,"error in cudaFree"
! stop
! endif
!
! istat = cuda_free(tau_new_dev)
! if (istat .ne. 0) then
! print *,"error in cudaFree"
! stop
! endif
!
! istat = cuda_free(x_dev)
! if (istat .ne. 0) then
! print *,"error in cudaFree"
! stop
! endif
!
!#endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("tridiag_band_complex_double")
#else
call timer%stop("tridiag_band_complex_single")
#endif
#endif
!#ifdef WITH_GPU_VERSION
! contains
!
! subroutine dot_product_kernel(nr,tau_new)
! implicit none
! integer, intent(in) :: nr
! complex*16, intent(in) :: tau_new
!
! call launch_dot_product_kernel( hs_dev,hv_new_dev,tau_new,x_dev,h_dev,hv_dev, nr )
! end subroutine
!
! subroutine dot_product_kernel_1( nb , nr , ns)
! implicit none
! integer, intent(in) :: nb, nr, ns
!
! call launch_dot_product_kernel_1(ab_dev,hs_dev, hv_new_dev,x_dev,h_dev,hv_dev,nb , nr, ns)
! end subroutine
!
! subroutine double_hh_transform_1( nb , ns)
! implicit none
! integer, intent(in) :: nb, ns
!
! call launch_double_hh_transform_1(ab_dev,hs_dev,hv_dev,nb , ns)
! end subroutine
!
! subroutine double_hh_transform_2( ns,nc, nb)
! implicit none
! integer, intent(in) :: nc, ns, nb
!
! call launch_double_hh_transform_2(ab_dev,hd_dev,hv_dev,nc , ns, nb)
! end subroutine
!#endif
#ifdef DOUBLE_PRECISION_COMPLEX
end subroutine tridiag_band_complex_double ! has to be checked for GPU
#else
end subroutine tridiag_band_complex_single ! has to be checked for GPU
#endif
#define ATODEV istat = cuda_memcpy(a_dev, loc(a), stripe_width*a_dim2*stripe_count*size_of_complex_datatype, cudaMemcpyHostToDevice)
#define ATOHOST istat = cuda_memcpy(loc(a), a_dev, stripe_width*a_dim2*stripe_count*size_of_complex_datatype, cudaMemcpyDeviceToHost)
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine trans_ev_tridi_to_band_complex_double(na, nev, nblk, nbw, q, ldq, matrixCols, &
hh_trans_complex, mpi_comm_rows, mpi_comm_cols, &
wantDebug, useGPU, success, THIS_COMPLEX_ELPA_KERNEL)
#else
subroutine trans_ev_tridi_to_band_complex_single(na, nev, nblk, nbw, q, ldq, matrixCols, &
hh_trans_complex, mpi_comm_rows, mpi_comm_cols, &
wantDebug, useGPU, success, THIS_COMPLEX_ELPA_KERNEL)
#endif
!-------------------------------------------------------------------------------
! trans_ev_tridi_to_band_complex:
! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
!
! Parameters
!
! na Order of matrix a, number of rows of matrix q
!
! nev Number of eigenvectors to compute (= columns of matrix q)
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwidth
!
! q On input: Eigenvectors of tridiagonal matrix
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! ldq Leading dimension of q
! matrixCols local columns of matrix q
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use pack_unpack_complex
use compute_hh_trafo_complex
use precision
use cuda_functions
use iso_c_binding
implicit none
logical, intent(in) :: useGPU
integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL
integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: q(ldq,*)
#else
complex(kind=COMPLEX_DATATYPE) :: q(ldq,matrixCols)
#endif
complex(kind=COMPLEX_DATATYPE) :: hh_trans_complex(:,:)
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
integer(kind=ik) :: tmp
integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2
integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end
integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end
integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length
integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count
#ifdef WITH_OPENMP
integer(kind=ik) :: thread_width, csw, b_off, b_len
#endif
integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd
integer(kind=ik) :: a_off, current_tv_off, max_blk_size
integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk
logical :: flag
#ifdef WITH_OPENMP
complex(kind=COMPLEX_DATATYPE), pointer :: a(:,:,:,:)
#else
complex(kind=COMPLEX_DATATYPE), pointer :: a(:,:,:)
#endif
complex(kind=COMPLEX_DATATYPE) :: a_complex
type(c_ptr) :: a_ptr
complex(kind=COMPLEX_DATATYPE), allocatable :: row(:)
complex(kind=COMPLEX_DATATYPE), allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
complex(kind=COMPLEX_DATATYPE), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
#else
complex(kind=COMPLEX_DATATYPE), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
#endif
integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
integer(kind=c_size_t) :: num
integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
integer(kind=c_intptr_t) :: row_dev
integer(kind=c_intptr_t) :: row_group_dev
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=c_intptr_t) :: hh_dot_dev
integer(kind=ik) :: row_group_size, unpack_idx
integer(kind=ik) :: n_times
integer(kind=ik) :: top, chunk, this_chunk
complex(kind=COMPLEX_DATATYPE), allocatable :: result_buffer(:,:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bcast_buffer(:,:)
integer(kind=ik) :: n_off
integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:)
integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:)
integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:)
#ifdef WITH_OPENMP
integer(kind=ik), allocatable :: mpi_statuses(:,:)
#ifdef WITH_MPI
integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE)
#endif
#endif
#ifdef WITH_MPI
integer(kind=ik), external :: numroc
#endif
integer(kind=ik) :: na_rows, na_cols
! real*8 :: ttt0, ttt1, ttt2, t2_compute_kernel, t0_compute_kernel,t1_compute_kernel, &
! t0_mpi_time, t1_mpi_time,t2_mpi_time
! real*8 :: t0_cpu_code,t1_cpu_code,t2_cpu_code,t0_block_time,t1_block_time,t2_block_time,t0_cuda_memcpy
! real*8 :: t0_inner_do_time, t1_inner_do_time , t2_inner_do_time,t0_outer_do_time ,t1_outer_do_time , &
! t2_outer_do_time ,t0_result_time ,t1_result_time, t2_result_time,t0_mpi_recv_time, &
! t1_mpi_recv_time,t2_mpi_recv_time
! real*8 :: t1_mpi_wait_time,t0_mpi_wait_time,t2_mpi_wait_time,t1_memcpy_time,t0_memcpy_time,t2_memcpy_time, &
! t1_mpi_irecv_time,t0_mpi_irecv_time,t2_mpi_irecv_time,t0_mpi_outer_wait_time,t1_mpi_outer_wait_time,&
! t2_mpi_outer_wait_time, time0
! real*4 :: time1
! MPI send/recv tags, arbitrary
integer(kind=ik), parameter :: bottom_recv_tag = 111
integer(kind=ik), parameter :: top_recv_tag = 222
integer(kind=ik), parameter :: result_recv_tag = 333
#ifdef WITH_OPENMP
integer(kind=ik) :: max_threads, my_thread
integer(kind=ik) :: omp_get_max_threads
#endif
! Just for measuring the kernel performance
real(kind=c_double) :: kernel_time ! MPI_WTIME always needs double
! long integer
integer(kind=lik) :: kernel_flops
logical, intent(in) :: wantDebug
integer(kind=ik) :: istat
character(200) :: errorMessage
logical :: successCUDA
logical :: success
#ifndef WITH_MPI
integer(kind=ik) :: j1
#endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("trans_ev_tridi_to_band_complex_double")
#else
call timer%start("trans_ev_tridi_to_band_complex_single")
#endif
#endif
if (useGPU) then
n_times =0
! n_times_1 =0
unpack_idx = 0
row_group_size = 0
! time0=0
! t0_compute_kernel=0
endif
kernel_time = 0.0
kernel_flops = 0
#ifdef WITH_OPENMP
max_threads = 1
max_threads = omp_get_max_threads()
#endif
call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
if (useGPU) then
#ifdef WITH_MPI
na_rows = numroc(na, nblk, my_prow, 0, np_rows)
na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
na_rows = na
na_cols = na
#endif
endif
success = .true.
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: band backtransform works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
nfact = nbw / nblk
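! nfact: number of nblk-sized blocks per band width (exact, thanks to the check above)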
! local number of eigenvectors
l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
if (l_nev==0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
thread_width = 0
#endif
stripe_width = 0
stripe_count = 0
last_stripe_width = 0
else
! Suggested stripe width is 48 - should this be reduced for the complex case ???
#ifdef WITH_OPENMP
thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
#endif
if (useGPU) then
stripe_width = 256
else
#ifdef DOUBLE_PRECISION_COMPLEX
stripe_width = 48 ! Must be a multiple of 2
#else
stripe_width = 48 ! Must be a multiple of 4
#endif
endif
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
stripe_count = (thread_width-1)/stripe_width + 1
#else /* WITH_OPENMP */
stripe_count = (l_nev-1)/stripe_width + 1
#endif /* WITH_OPENMP */
! Adapt stripe width so that last one doesn't get too small
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
stripe_width = (thread_width-1)/stripe_count + 1
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
stripe_width = (l_nev-1)/stripe_count + 1
endif
#endif /* WITH_OPENMP */
if (.not.(useGPU)) then
#ifdef DOUBLE_PRECISION_COMPLEX
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
! (2 * sizeof(double complex) == 32); rounding up to a multiple of 4 satisfies this as well
#else
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
! (4 * sizeof(float complex) == 32)
#endif
endif
#ifndef WITH_OPENMP
last_stripe_width = l_nev - (stripe_count-1)*stripe_width
#endif /* WITH_OPENMP */
endif
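! Worked example (CPU, no OpenMP): l_nev=100 gives stripe_count = (100-1)/48+1 = 3,
! stripe_width = (100-1)/3+1 = 34, rounded up to 36; last_stripe_width = 100-2*36 = 28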
! Determine the matrix distribution at the beginning
allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error when allocating limits "//errorMessage
stop
endif
call determine_workload(na, nbw, np_rows, limits)
max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
a_dim2 = max_blk_size + nbw
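! Each stripe keeps nbw extra rows beyond the largest block so that the bottom
! border received from the next PE fits below the local rows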
!DEC$ ATTRIBUTES ALIGN: 64:: a
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
if (.not.(useGPU)) then
if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_complex)) /= 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
stop
endif
call c_f_pointer(a_ptr, a, [stripe_width,a_dim2,stripe_count,max_threads] )
! allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
endif
#else /* OpenMP */
if (.not.(useGPU)) then
if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_complex)) /= 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
stop
endif
call c_f_pointer(a_ptr, a, [stripe_width,a_dim2,stripe_count] )
! allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
a(:,:,:) = 0
endif
#endif /* WITH_OPENMP */
allocate(row(l_nev), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating row "//errorMessage
stop
endif
row(:) = 0
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
num = (stripe_width*a_dim2*stripe_count)*size_of_double_complex_datatype
if (na_rows * na_cols .lt. stripe_width*a_dim2*stripe_count) then
print *,"trans_ev_tridi_to_band_complex a_dev ",na_rows * na_cols, stripe_width*a_dim2*stripe_count
! stop
endif
successCUDA = cuda_malloc(a_dev, stripe_width*a_dim2*stripe_count*size_of_double_complex_datatype)
#else
num = (stripe_width*a_dim2*stripe_count)*size_of_single_complex_datatype
if (na_rows * na_cols .lt. stripe_width*a_dim2*stripe_count) then
print *,"trans_ev_tridi_to_band_complex a_dev ",na_rows * na_cols, stripe_width*a_dim2*stripe_count
! stop
endif
successCUDA = cuda_malloc(a_dev, stripe_width*a_dim2*stripe_count*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
stop
endif
if (num .gt. na_rows * na_cols) then
print *,"trans_ev_tridi_to_band_complex a_dev 1",num, na_rows * na_cols
! stop
endif
successCUDA = cuda_memset(a_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
num = (l_nev)*size_of_double_complex_datatype
#else
num = (l_nev)*size_of_single_complex_datatype
#endif
successCUDA = cuda_malloc( row_dev,num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
stop
endif
successCUDA = cuda_memset(row_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
stop
endif
! "row_group" and "row_group_dev" are needed for GPU optimizations
allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating row_group "//errorMessage
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
row_group(:, :) = 0._ck8
#else
row_group(:, :) = 0._ck4
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
num = (l_nev*nblk)*size_of_double_complex_datatype
#else
num = (l_nev*nblk)*size_of_single_complex_datatype
#endif
successCUDA = cuda_malloc(row_group_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
stop
endif
successCUDA = cuda_memset(row_group_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
stop
endif
endif ! useGPU
! Copy q from a block cyclic distribution into a distribution with contiguous rows,
! and transpose the matrix using stripes of given stripe_width for cache blocking.
! The peculiar way it is done below is due to the fact that the last row should be
! ready first since it is the first one to start below
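! (hence the loop below runs from ip = np_rows-1 down to 0, filling the bottom rows first)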
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
! Please note about the OMP usage below:
! This is not for speed, but because we want the matrix a in the memory and
! in the cache of the correct thread (if possible)
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("OpenMP parallel_double")
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_COMPLEX
a(:,:,:,my_thread) = 0._ck8 ! if possible, do first touch allocation!
#else
a(:,:,:,my_thread) = 0._ck4 ! if possible, do first touch allocation!
#endif
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("OpenMP parallel_double")
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* WITH_OPENMP */
do ip = np_rows-1, 0, -1
if (my_prow == ip) then
! Receive my rows which have not yet been received
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
src = mod((i-1)/nblk, np_rows)
if (src < my_prow) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr)
#else
call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, mpi_status, mpierr)
#endif
#else /* WITH_MPI */
row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_COMPLEX
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu_double(i - limits(ip), .false.)
#ifdef WITH_MPI
call MPI_Recv(row_group(:, row_group_size), l_nev,MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
#endif
else
#ifdef WITH_MPI
call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row(1:l_nev) = row(1:l_nev)
#endif
endif
#else /* DOUBLE_PRECISION_COMPLEX */
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu_single(i - limits(ip), .false.)
#ifdef WITH_MPI
call MPI_Recv(row_group(:, row_group_size), l_nev,MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
#endif
else
#ifdef WITH_MPI
call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row(1:l_nev) = row(1:l_nev)
#endif
endif
#endif /* DOUBLE_PRECISION_COMPLEX */
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("OpenMP parallel_double")
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_COMPLEX
call unpack_row_complex_cpu_openmp_double(a, row,i-limits(ip),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
#else
call unpack_row_complex_cpu_openmp_single(a, row,i-limits(ip),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
#endif
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("OpenMP parallel_double")
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
#ifdef DOUBLE_PRECISION_COMPLEX
call unpack_row_complex_cpu_double(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
#else
call unpack_row_complex_cpu_single(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
#endif
endif
#endif /* WITH_OPENMP */
elseif (src==my_prow) then
src_offset = src_offset+1
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
call unpack_and_prepare_row_group_complex_gpu_double(i - limits(ip),.false.)
#else
call unpack_and_prepare_row_group_complex_gpu_single(i - limits(ip),.false.)
#endif
row_group(:, row_group_size) = q(src_offset, 1:l_nev)
else
row(:) = q(src_offset, 1:l_nev)
endif
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("OpenMP parallel_double")
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_COMPLEX
call unpack_row_complex_cpu_openmp_double(a, row,i-limits(ip),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
#else
call unpack_row_complex_cpu_openmp_single(a, row,i-limits(ip),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
#endif
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("OpenMP parallel_double")
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
#ifdef DOUBLE_PRECISION_COMPLEX
call unpack_row_complex_cpu_double(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
#else
call unpack_row_complex_cpu_single(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
#endif
endif
#endif /* WITH_OPENMP */
endif
enddo
! Send all rows which have not yet been sent
src_offset = 0
do dst = 0, ip-1
do i=limits(dst)+1,limits(dst+1)
if(mod((i-1)/nblk, np_rows) == my_prow) then
src_offset = src_offset+1
row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Send(row, l_nev, MPI_COMPLEX16, dst, 0, mpi_comm_rows, mpierr)
#else
call MPI_Send(row, l_nev, MPI_COMPLEX8, dst, 0, mpi_comm_rows, mpierr)
#endif
#endif /* WITH_MPI */
endif
enddo
enddo
else if(my_prow < ip) then
! Send all rows going to PE ip
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
src = mod((i-1)/nblk, np_rows)
if (src == my_prow) then
src_offset = src_offset+1
row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Send(row, l_nev, MPI_COMPLEX16, ip, 0, mpi_comm_rows, mpierr)
#else
call MPI_Send(row, l_nev, MPI_COMPLEX8, ip, 0, mpi_comm_rows, mpierr)
#endif
#endif /* WITH_MPI */
endif
enddo
! Receive all rows from PE ip
do i=limits(my_prow)+1,limits(my_prow+1)
src = mod((i-1)/nblk, np_rows)
if (src == ip) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr)
#else
call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, mpi_status, mpierr)
#endif
#else /* WITH_MPI */
row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_COMPLEX
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu_double(i - limits(my_prow), .false.)
#ifdef WITH_MPI
call MPI_Recv(row_group(:, row_group_size), l_nev,MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
#endif
else
#ifdef WITH_MPI
call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row(1:l_nev) = row(1:l_nev)
#endif
endif
#else /* DOUBLE_PRECISION_COMPLEX */
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu_single(i - limits(my_prow), .false.)
#ifdef WITH_MPI
call MPI_Recv(row_group(:, row_group_size), l_nev,MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct?
#endif
else
#ifdef WITH_MPI
call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row(1:l_nev) = row(1:l_nev)
#endif
endif
#endif /* DOUBLE_PRECISION_COMPLEX */
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("OpenMP parallel_double")
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_COMPLEX
call unpack_row_complex_cpu_openmp_double(a, row,i-limits(my_prow),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
#else
call unpack_row_complex_cpu_openmp_single(a, row,i-limits(my_prow),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
#endif
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("OpenMP parallel_double")
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
#ifdef DOUBLE_PRECISION_COMPLEX
call unpack_row_complex_cpu_double(a, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
#else
call unpack_row_complex_cpu_single(a, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
#endif
endif
#endif /* WITH_OPENMP */
endif
enddo
endif
enddo
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
call unpack_and_prepare_row_group_complex_gpu_double(-1, .true.)
#else
call unpack_and_prepare_row_group_complex_gpu_single(-1, .true.)
#endif
successCUDA = cuda_devicesynchronize()
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaDeviceSynchronize"
stop
endif
endif
! Set up result buffer queue
num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows
num_result_buffers = 4*nfact
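! num_result_blocks counts the nblk-row blocks owned by this process row in the
! block-cyclic distribution, e.g. na=1000, nblk=32, np_rows=4: ((999/32)+4-my_prow)/4 = 8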
allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating result_buffer "//errorMessage
stop
endif
allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating result_send_request "//errorMessage
stop
endif
allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating result_recv_request "//errorMessage
stop
endif
#ifdef WITH_MPI
result_send_request(:) = MPI_REQUEST_NULL
result_recv_request(:) = MPI_REQUEST_NULL
#endif
! Queue up buffers
#ifdef WITH_MPI
if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
do j = 1, min(num_result_buffers, num_result_blocks)
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, &
mpi_comm_rows, result_recv_request(j), mpierr)
#else
call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_COMPLEX8, 0, result_recv_tag, &
mpi_comm_rows, result_recv_request(j), mpierr)
#endif
enddo
endif
#else /* WITH_MPI */
! careful: the "receive" has to be done at the corresponding wait or send
!if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
! do j = 1, min(num_result_buffers, num_result_blocks)
! result_buffer(1:l_nev*nblk,1,j) = result_buffer(1:l_nev*nblk,1,nbuf)
! enddo
!endif
#endif /* WITH_MPI */
num_bufs_recvd = 0 ! No buffers received yet
! Initialize top/bottom requests
allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating top_send_request "//errorMessage
stop
endif
allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating top_recv_request "//errorMessage
stop
endif
allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bottom_send_request "//errorMessage
stop
endif
allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bottom_recv_request "//errorMessage
stop
endif
#ifdef WITH_MPI
top_send_request(:) = MPI_REQUEST_NULL
top_recv_request(:) = MPI_REQUEST_NULL
bottom_send_request(:) = MPI_REQUEST_NULL
bottom_recv_request(:) = MPI_REQUEST_NULL
#endif
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating top_border_send_buffer "//errorMessage
stop
endif
allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating top_border_recv_buffer "//errorMessage
stop
endif
allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_send_buffer "//errorMessage
stop
endif
allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_recv_buffer "//errorMessage
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
top_border_send_buffer(:,:) = 0._ck8
top_border_recv_buffer(:,:) = 0._ck8
bottom_border_send_buffer(:,:) = 0._ck8
bottom_border_recv_buffer(:,:) = 0._ck8
#else
top_border_send_buffer(:,:) = 0._ck4
top_border_recv_buffer(:,:) = 0._ck4
bottom_border_send_buffer(:,:) = 0._ck4
bottom_border_recv_buffer(:,:) = 0._ck4
#endif
#else /* OpenMP */
allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating top_border_send_buffer "//errorMessage
stop
endif
allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating top_border_recv_buffer "//errorMessage
stop
endif
allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_send_buffer "//errorMessage
stop
endif
allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_recv_buffer "//errorMessage
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
top_border_send_buffer(:,:,:) = 0._ck8
top_border_recv_buffer(:,:,:) = 0._ck8
bottom_border_send_buffer(:,:,:) = 0._ck8
bottom_border_recv_buffer(:,:,:) = 0._ck8
#else
top_border_send_buffer(:,:,:) = 0._ck4
top_border_recv_buffer(:,:,:) = 0._ck4
bottom_border_send_buffer(:,:,:) = 0._ck4
bottom_border_recv_buffer(:,:,:) = 0._ck4
#endif
#endif /* WITH_OPENMP */
! Initialize broadcast buffer
allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bcast_buffer "//errorMessage
stop
endif
bcast_buffer = 0
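! bcast_buffer holds the Householder vectors of the current sweep, one vector of
! length nbw per column; it is broadcast along the process columns in each sweep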
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
num = ( nbw * max_blk_size) * size_of_double_complex_datatype
#else
num = ( nbw * max_blk_size) * size_of_single_complex_datatype
#endif
successCUDA = cuda_malloc(bcast_buffer_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc"
stop
endif
successCUDA = cuda_memset( bcast_buffer_dev, 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
num = ((max_blk_size-1))*size_of_double_complex_datatype
#else
num = ((max_blk_size-1))*size_of_single_complex_datatype
#endif
successCUDA = cuda_malloc( hh_dot_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc"
stop
endif
successCUDA = cuda_memset( hh_dot_dev, 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
num = (max_blk_size)*size_of_double_complex_datatype
#else
num = (max_blk_size)*size_of_single_complex_datatype
#endif
successCUDA = cuda_malloc( hh_tau_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc"
stop
endif
successCUDA = cuda_memset( hh_tau_dev, 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset"
stop
endif
endif ! useGPU
current_tv_off = 0 ! Offset of the next Householder vector to be broadcast
! ------------------- start of work loop -------------------
a_off = 0 ! offset in A (to avoid unnecessary shifts)
top_msg_length = 0
bottom_msg_length = 0
#ifdef WITH_GPU_VERSION
!! istat = cuda_ProfilerStart()
!! istat = cudaFuncSetCacheConfig ( launch_compute_hh_trafo_c_kernel_complex, cudaFuncCachePreferShared)
!! t0_compute_kernel = 0
! t0_mpi_time = 0
! t0_cuda_memcpy =0
! t0_cpu_code =0
! t0_outer_do_time =0
! t0_inner_do_time =0
! t1_outer_do_time =MPI_Wtime()
! t0_block_time =0
! t0_mpi_wait_time = 0
! t0_memcpy_time = 0
! t0_mpi_outer_wait_time=0
#endif
do sweep = 0, (na-1)/nbw
#ifdef WITH_GPU_VERSION
! t1_cpu_code =MPI_Wtime()
#endif
current_n = na - sweep*nbw
call determine_workload(current_n, nbw, np_rows, limits)
current_n_start = limits(my_prow)
current_n_end = limits(my_prow+1)
current_local_n = current_n_end - current_n_start
next_n = max(current_n - nbw, 0)
call determine_workload(next_n, nbw, np_rows, limits)
next_n_start = limits(my_prow)
next_n_end = limits(my_prow+1)
next_local_n = next_n_end - next_n_start
if (next_n_end < next_n) then
bottom_msg_length = current_n_end - next_n_end
else
bottom_msg_length = 0
endif
if (next_local_n > 0) then
next_top_msg_length = current_n_start - next_n_start
else
next_top_msg_length = 0
endif
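! bottom_msg_length: rows handed down to the next process row after this sweep;
! next_top_msg_length: rows expected at the top from the previous process row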
#ifdef WITH_GPU_VERSION
! t2_cpu_code =MPI_Wtime()
! t0_cpu_code = t0_cpu_code + (t2_cpu_code - t1_cpu_code)
#endif
if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
do i = 1, stripe_count
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width"
b_len = csw*nbw*max_threads
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#else
call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_COMPLEX8, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#endif
#else /* WITH_MPI */
! careful: the "receive" has to be done at the corresponding wait or send
! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#else
call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX8, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#endif
#else /* WITH_MPI */
! careful: the "receive" has to be done at the corresponding wait or send
! bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i)
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
enddo
endif
#ifdef WITH_GPU_VERSION
! t1_block_time = MPI_Wtime()
#endif
if (current_local_n > 1) then
if (my_pcol == mod(sweep,np_cols)) then
bcast_buffer(:,1:current_local_n) = hh_trans_complex(:,current_tv_off+1:current_tv_off+current_local_n)
current_tv_off = current_tv_off + current_local_n
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_COMPLEX16, mod(sweep,np_cols), mpi_comm_cols, mpierr)
#else
call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_COMPLEX8, mod(sweep,np_cols), mpi_comm_cols, mpierr)
#endif
#endif /* WITH_MPI */
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), nbw * &
current_local_n * size_of_double_complex_datatype , &
cudaMemcpyHostToDevice)
call extract_hh_tau_complex_gpu_double(nbw, current_local_n, .false.)
call compute_hh_dot_products_complex_gpu_double(nbw, current_local_n)
#else
successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), nbw * &
current_local_n * size_of_single_complex_datatype , &
cudaMemcpyHostToDevice)
call extract_hh_tau_complex_gpu_single(nbw, current_local_n, .false.)
call compute_hh_dot_products_complex_gpu_single(nbw, current_local_n)
#endif
endif
else
! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_complex
#ifdef DOUBLE_PRECISION_COMPLEX
bcast_buffer(:,1) = 0._ck8
#else
bcast_buffer(:,1) = 0._ck4
#endif
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * size_of_double_complex_datatype)
#else
successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call extract_hh_tau_complex_gpu_double(nbw, 1, .true.)
#else
call extract_hh_tau_complex_gpu_single(nbw, 1, .true.)
#endif
!NOTE(ca): I commented out the following line
! istat = cuda_memcpy(loc(bcast_buffer(1,1)),bcast_buffer_dev,nbw*current_local_n * size_of_complex_datatype ,
! cudaMemcpyDeviceToHost)
! if (istat .ne. 0) then
! print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc"
! stop
! endif
endif ! useGPU
endif
#ifdef WITH_GPU_VERSION
! t2_block_time =MPI_Wtime()
! t0_block_time = t0_block_time + ( t2_block_time - t1_block_time)
#endif
if (l_nev == 0) cycle
if (current_local_n > 0) then
#ifdef WITH_GPU_VERSION
! t1_inner_do_time =MPI_Wtime()
#endif
do i = 1, stripe_count
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
! Get the real stripe width for stripe i;
! the last OpenMP tasks may have an even smaller stripe width,
! but we don't care about this, i.e. we send/recv a bit too much in this case.
! csw: current_stripe_width
csw = min(stripe_width, thread_width-(i-1)*stripe_width)
#endif /* WITH_OPENMP */
!wait_b
if (current_n_end < current_n) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef WITH_MPI
call MPI_Wait(bottom_recv_request(i), mpi_status, mpierr)
#endif
#else /* WITH_OPENMP */
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
#ifdef WITH_MPI
call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr)
#endif
#ifdef WITH_GPU_VERSION
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time - t1_mpi_wait_time)
!
! t1_memcpy_time =MPI_Wtime()
#endif
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("OpenMP parallel_double")
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
n_off = current_local_n+a_off
b_len = csw*nbw
b_off = (my_thread-1)*b_len
a(1:csw,n_off+1:n_off+nbw,i,my_thread) = &
reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /))
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("OpenMP parallel_double")
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#else /* WITH_OPENMP */
n_off = current_local_n+a_off
if (useGPU) then
! t1_memcpy_time =MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_double_complex_datatype
successCUDA = cuda_memcpy( a_dev + dev_offset ,loc(bottom_border_recv_buffer(1,1,i)), &
stripe_width*nbw*size_of_double_complex_datatype ,cudaMemcpyHostToDevice)
#else
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_single_complex_datatype
successCUDA = cuda_memcpy( a_dev + dev_offset ,loc(bottom_border_recv_buffer(1,1,i)), &
stripe_width*nbw*size_of_single_complex_datatype ,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc"
stop
endif
! t2_memcpy_time =MPI_Wtime()
! t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time - t1_memcpy_time)
else
a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i)
endif
#endif /* WITH_OPENMP */
if (next_n_end < next_n) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"not yet implemented"
stop
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, &
MPI_COMPLEX16, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#else
call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, &
MPI_COMPLEX8, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#endif
#else /* WITH_MPI */
! careful: the "receive" has to be done at the corresponding wait or send
! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
#endif
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#else
call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX8, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#endif
#else /* WITH_MPI */
! careful: the "receive" has to be done at the corresponding wait or send
! bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i)
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
endif
endif
if (current_local_n <= bottom_msg_length + top_msg_length) then
!wait_t
if (top_msg_length>0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef WITH_MPI
call MPI_Wait(top_recv_request(i), mpi_status, mpierr)
#endif
#else /* WITH_OPENMP */
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
#ifdef WITH_MPI
call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
#endif
if (useGPU) then
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time -t1_mpi_wait_time)
! t1_memcpy_time =MPI_Wtime()
!
#ifdef DOUBLE_PRECISION_COMPLEX
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * &
size_of_double_complex_datatype
#else
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * &
size_of_single_complex_datatype
#endif
! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ))* 16
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy( a_dev+dev_offset ,loc(top_border_recv_buffer(1,1,i)), &
stripe_width*top_msg_length*size_of_double_complex_datatype , cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy( a_dev+dev_offset ,loc(top_border_recv_buffer(1,1,i)), &
stripe_width*top_msg_length*size_of_single_complex_datatype , cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
stop
endif
! t2_memcpy_time =MPI_Wtime()
! t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time - t1_memcpy_time)
else
a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
endif ! useGPU
#endif /* WITH_OPENMP */
endif
!compute
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("OpenMP parallel_double")
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
if (top_msg_length>0) then
b_len = csw*top_msg_length
b_off = (my_thread-1)*b_len
a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call compute_hh_trafo_complex_cpu_openmp_double(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, current_local_n, i, my_thread, thread_width, &
THIS_COMPLEX_ELPA_KERNEL)
#else
call compute_hh_trafo_complex_cpu_openmp_single(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, current_local_n, i, my_thread, thread_width, &
THIS_COMPLEX_ELPA_KERNEL)
#endif
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("OpenMP parallel_double")
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#else /* WITH_OPENMP */
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
call compute_hh_trafo_complex_gpu_double(0, current_local_n, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
! call compute_hh_trafo_complex_gpu_double(0, current_local_n, i)
#else
call compute_hh_trafo_complex_gpu_single(0, current_local_n, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
! call compute_hh_trafo_complex_gpu_single(0, current_local_n, i)
#endif
else
#ifdef DOUBLE_PRECISION_COMPLEX
call compute_hh_trafo_complex_cpu_double(a, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, current_local_n, i, last_stripe_width, &
THIS_COMPLEX_ELPA_KERNEL)
#else
call compute_hh_trafo_complex_cpu_single(a, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, current_local_n, i, last_stripe_width, &
THIS_COMPLEX_ELPA_KERNEL)
#endif
endif
#endif /* WITH_OPENMP */
!send_b
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef WITH_MPI
call MPI_Wait(bottom_send_request(i), mpi_status, mpierr)
#endif
#else /* WITH_OPENMP */
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
#ifdef WITH_MPI
call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
#endif
#ifdef WITH_GPU_VERSION
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time-t1_mpi_wait_time)
#endif
#endif /* WITH_OPENMP */
if (bottom_msg_length>0) then
n_off = current_local_n+nbw-bottom_msg_length+a_off
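! n_off marks the start of the last bottom_msg_length columns of the freshly
! transformed block; these columns of each stripe are sent to my_prow+1,
! where they arrive as its top border.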
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
b_len = csw*bottom_msg_length*max_threads
bottom_border_send_buffer(1:b_len,i) = &
reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#else
call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX8, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#endif
#else /* WITH_MPI */
if (next_top_msg_length > 0) then
top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = &
bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i)
endif
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
if (useGPU) then
! t1_memcpy_time =MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * &
size_of_double_complex_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, &
stripe_width * bottom_msg_length * size_of_double_complex_datatype , &
cudaMemcpyDeviceToHost)
#else
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * &
size_of_single_complex_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, &
stripe_width * bottom_msg_length * size_of_single_complex_datatype , &
cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
stop
endif
! t2_memcpy_time =MPI_Wtime()
! t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time -t1_memcpy_time)
else
bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i)
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#else
call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX8, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#endif
#else /* WITH_MPI */
if (next_top_msg_length > 0) then
top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = &
bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i)
endif
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
endif
else
!compute
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("OpenMP parallel_double")
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_COMPLEX
call compute_hh_trafo_complex_cpu_openmp_double(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, &
thread_width, THIS_COMPLEX_ELPA_KERNEL)
#else
call compute_hh_trafo_complex_cpu_openmp_single(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, &
thread_width, THIS_COMPLEX_ELPA_KERNEL)
#endif
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("OpenMP parallel_double")
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_COMPLEX
if (useGPU) then
call compute_hh_trafo_complex_gpu_double(current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
dev_offset, dev_offset_1, dev_offset_2)
! call compute_hh_trafo_complex_gpu_double(current_local_n -bottom_msg_length, bottom_msg_length, i)
else
call compute_hh_trafo_complex_cpu_double(a, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, &
last_stripe_width, THIS_COMPLEX_ELPA_KERNEL)
endif
#else
if (useGPU) then
call compute_hh_trafo_complex_gpu_single(current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
dev_offset, dev_offset_1, dev_offset_2)
! call compute_hh_trafo_complex_gpu_single(current_local_n -bottom_msg_length, bottom_msg_length, i)
else
call compute_hh_trafo_complex_cpu_single(a, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, &
last_stripe_width, THIS_COMPLEX_ELPA_KERNEL)
endif
#endif
#endif /* WITH_OPENMP */
!send_b
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef WITH_MPI
call MPI_Wait(bottom_send_request(i), mpi_status, mpierr)
#endif
#else /* WITH_OPENMP */
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
#ifdef WITH_MPI
call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
#endif
#ifdef WITH_GPU_VERSION
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time-t1_mpi_wait_time)
#endif
#endif /* WITH_OPENMP */
if (bottom_msg_length > 0) then
n_off = current_local_n+nbw-bottom_msg_length+a_off
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
b_len = csw*bottom_msg_length*max_threads
bottom_border_send_buffer(1:b_len,i) = &
reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#else
call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX8, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#endif
#else /* WITH_MPI */
if (next_top_msg_length > 0) then
top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = &
bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i)
endif
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
if (useGPU) then
! t1_memcpy_time =MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * &
size_of_double_complex_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, &
stripe_width * bottom_msg_length * size_of_double_complex_datatype , &
cudaMemcpyDeviceToHost)
#else
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * &
size_of_single_complex_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, &
stripe_width * bottom_msg_length * size_of_single_complex_datatype , &
cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
stop
endif
! t2_memcpy_time =MPI_Wtime()
! t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time -t1_memcpy_time)
else
bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i)
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#else
call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX8, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#endif
#else /* WITH_MPI */
if (next_top_msg_length > 0) then
top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = &
bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i)
endif
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
endif
!compute
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("OpenMP parallel_double")
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_COMPLEX
call compute_hh_trafo_complex_cpu_openmp_double(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, &
kernel_time, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, &
my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL)
#else
call compute_hh_trafo_complex_cpu_openmp_single(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, &
kernel_time, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, &
my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL)
#endif
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("OpenMP parallel_double")
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_COMPLEX
if (useGPU) then
! call compute_hh_trafo_complex_gpu_double(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, i)
call compute_hh_trafo_complex_gpu_double(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
i, a_off, &
dev_offset, dev_offset_1, dev_offset_2)
else
call compute_hh_trafo_complex_cpu_double(a, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
last_stripe_width, THIS_COMPLEX_ELPA_KERNEL)
#else
if (useGPU) then
! call compute_hh_trafo_complex_gpu_single(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, i)
call compute_hh_trafo_complex_gpu_single(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
i, a_off, &
dev_offset, dev_offset_1, dev_offset_2)
else
call compute_hh_trafo_complex_cpu_single(a, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
last_stripe_width, THIS_COMPLEX_ELPA_KERNEL)
#endif
endif
#endif /* WITH_OPENMP */
!wait_t
if (top_msg_length>0) then
#ifdef WITH_OPENMP
#ifdef WITH_MPI
call MPI_Wait(top_recv_request(i), mpi_status, mpierr)
#endif
#else /* WITH_OPENMP */
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
#ifdef WITH_MPI
call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
#endif
if (useGPU) then
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time +(t2_mpi_wait_time-t1_mpi_wait_time)
!
! t1_memcpy_time =MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * &
size_of_double_complex_datatype
successCUDA = cuda_memcpy( a_dev + dev_offset , loc(top_border_recv_buffer(:,1,i)), &
stripe_width * top_msg_length *size_of_double_complex_datatype , &
cudaMemcpyHostToDevice)
#else
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * &
size_of_single_complex_datatype
successCUDA = cuda_memcpy( a_dev + dev_offset , loc(top_border_recv_buffer(:,1,i)), &
stripe_width * top_msg_length *size_of_single_complex_datatype , &
cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
stop
endif
!
! t2_memcpy_time =MPI_Wtime()
! t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time-t1_memcpy_time)
else
a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
endif
#endif /* WITH_OPENMP */
endif
!compute
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("OpenMP parallel_double")
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
if (top_msg_length>0) then
b_len = csw*top_msg_length
b_off = (my_thread-1)*b_len
a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call compute_hh_trafo_complex_cpu_openmp_double(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, top_msg_length, i, my_thread, thread_width, &
THIS_COMPLEX_ELPA_KERNEL)
#else
call compute_hh_trafo_complex_cpu_openmp_single(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, top_msg_length, i, my_thread, thread_width, &
THIS_COMPLEX_ELPA_KERNEL)
#endif
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("OpenMP parallel_double")
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_COMPLEX
if (useGPU) then
call compute_hh_trafo_complex_gpu_double(0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
! call compute_hh_trafo_complex_gpu_double(0, top_msg_length, i)
else
call compute_hh_trafo_complex_cpu_double(a, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, top_msg_length, i, last_stripe_width, &
THIS_COMPLEX_ELPA_KERNEL)
endif
#else
if (useGPU) then
call compute_hh_trafo_complex_gpu_single(0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
! call compute_hh_trafo_complex_gpu_single(0, top_msg_length, i)
else
call compute_hh_trafo_complex_cpu_single(a, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, top_msg_length, i, last_stripe_width, &
THIS_COMPLEX_ELPA_KERNEL)
endif
#endif
#endif /* WITH_OPENMP */
endif
if (next_top_msg_length > 0) then
!request top_border data
#ifdef WITH_OPENMP
b_len = csw*next_top_msg_length*max_threads
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow-1, &
top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
#else
call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_COMPLEX8, my_prow-1, &
top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
#endif
#else /* WITH_MPI */
! carefull the "recieve" has to be done at the corresponding send or wait
! top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_COMPLEX16, my_prow-1, &
top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
#else
call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_COMPLEX8, my_prow-1, &
top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
#endif
#else /* WITH_MPI */
! carefull the "recieve" has to be done at the corresponding send or wait
! top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = &
! bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i)
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
endif
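! send_t below ships columns a_off+1 .. a_off+nbw of each stripe to the
! process row above (my_prow-1), where they are received as its bottom border.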
!send_t
if (my_prow > 0) then
#ifdef WITH_OPENMP
#ifdef WITH_MPI
call MPI_Wait(top_send_request(i), mpi_status, mpierr)
#endif
#else /* WITH_OPENMP */
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
#ifdef WITH_MPI
call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
#endif
#ifdef WITH_GPU_VERSION
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time+(t2_mpi_wait_time-t1_mpi_wait_time)
#endif
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
b_len = csw*nbw*max_threads
top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /))
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_COMPLEX16, &
my_prow-1, bottom_recv_tag, &
mpi_comm_rows, top_send_request(i), mpierr)
#else
call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_COMPLEX8, &
my_prow-1, bottom_recv_tag, &
mpi_comm_rows, top_send_request(i), mpierr)
#endif
#else /* WITH_MPI */
if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
endif
if (next_n_end < next_n) then
bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
endif
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
if (useGPU) then
! t1_memcpy_time =MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * &
size_of_double_complex_datatype
successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), a_dev + dev_offset, &
stripe_width*nbw*size_of_double_complex_datatype ,cudaMemcpyDeviceToHost)
#else
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * &
size_of_single_complex_datatype
successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), a_dev + dev_offset, &
stripe_width*nbw*size_of_single_complex_datatype ,cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
stop
endif
! t2_memcpy_time =MPI_Wtime()
! t0_memcpy_time = t0_memcpy_time + (t2_memcpy_time-t1_memcpy_time)
!
else
top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i)
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow-1, bottom_recv_tag, &
mpi_comm_rows, top_send_request(i), mpierr)
#else
call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX8, my_prow-1, bottom_recv_tag, &
mpi_comm_rows, top_send_request(i), mpierr)
#endif
#else /* WITH_MPI */
if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
bottom_border_recv_buffer(1:nbw,1:stripe_width,i) = top_border_send_buffer(1:nbw,1:stripe_width,i)
endif
if (next_n_end < next_n) then
bottom_border_recv_buffer(1:nbw,1:stripe_width,i) = top_border_send_buffer(1:nbw,1:stripe_width,i)
endif
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
endif
! Make sure there are not too many outstanding top_recv_requests
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
if (stripe_count > 1) then
if (i>1) then
#ifdef WITH_MPI
#ifdef WITH_OPENMP
call MPI_Wait(top_recv_request(i-1), mpi_status, mpierr)
#else
call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr)
#endif
#endif /* WITH_MPI */
else
#ifdef WITH_MPI
#ifdef WITH_OPENMP
call MPI_Wait(top_recv_request(stripe_count), mpi_status, mpierr)
#else
call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr)
#endif
#endif /* WITH_MPI */
endif
endif
#ifdef WITH_GPU_VERSION
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time+(t2_mpi_wait_time-t1_mpi_wait_time)
#endif
enddo
#ifdef WITH_GPU_VERSION
! t2_inner_do_time =MPI_Wtime()
! t0_inner_do_time = t0_inner_do_time + ( t2_inner_do_time - t1_inner_do_time)
#endif
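! Carry the requested border length over to the next sweep: what was posted
! as next_top_msg_length above will already be waiting in
! top_border_recv_buffer when the next sweep starts.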
top_msg_length = next_top_msg_length
else
! wait for last top_send_request
#ifdef WITH_GPU_VERSION
! t1_mpi_outer_wait_time =MPI_Wtime()
#endif
do i = 1, stripe_count
#ifdef WITH_MPI
#ifdef WITH_OPENMP
call MPI_Wait(top_send_request(i), mpi_status, mpierr)
#else
call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
#endif
#endif /* WITH_MPI */
enddo
#ifdef WITH_GPU_VERSION
! t2_mpi_outer_wait_time =MPI_Wtime()
! t0_mpi_outer_wait_time =t0_mpi_outer_wait_time+(t2_mpi_outer_wait_time-t1_mpi_outer_wait_time)
#endif
endif
#ifdef WITH_GPU_VERSION
! t0_result_time = MPI_Wtime()
#endif
! Take care of the result
if (my_prow == 0) then
! topmost process sends nbw rows to destination processes
do j=0,nfact-1
num_blk = sweep*nfact+j ! global number of destination block, 0 based
if (num_blk*nblk >= na) exit
nbuf = mod(num_blk, num_result_buffers) + 1 ! buffer number to get this block
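! The result buffers are used cyclically, which bounds the number of
! MPI_Isends in flight. For illustration: with num_result_buffers = 4 the
! destination blocks 0,1,2,3,4,5,... are staged through buffers 1,2,3,4,1,2,...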
#ifdef WITH_MPI
#ifdef WITH_OPENMP
call MPI_Wait(result_send_request(nbuf), mpi_status, mpierr)
#else
call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr)
#endif
#endif /* WITH_MPI */
dst = mod(num_blk, np_rows)
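! The destination process row follows the block-cyclic layout of q: block
! num_blk belongs to process row mod(num_blk, np_rows). dst == 0 means the
! block stays on this (topmost) process row and is copied into q directly.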
if (dst == 0) then
if (useGPU) then
row_group_size = min(na - num_blk*nblk, nblk)
#ifdef DOUBLE_PRECISION_COMPLEX
call pack_row_group_complex_gpu_double(row_group(:, :), j * nblk + a_off,row_group_size)
#else
call pack_row_group_complex_gpu_single(row_group(:, :), j * nblk + a_off,row_group_size)
#endif
do i = 1, row_group_size
q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i)
enddo
else
do i = 1, min(na - num_blk*nblk, nblk)
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef WITH_OPENMP
call pack_row_complex_cpu_openmp_double(a, row, j*nblk+i+a_off, &
stripe_width, stripe_count, max_threads, thread_width, l_nev)
#else
call pack_row_complex_cpu_double(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
#endif
#else
#ifdef WITH_OPENMP
call pack_row_complex_cpu_openmp_single(a, row, j*nblk+i+a_off, &
stripe_width, stripe_count, max_threads, thread_width, l_nev)
#else
call pack_row_complex_cpu_single(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
#endif
#endif
q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:)
enddo
endif
else
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
call pack_row_group_complex_gpu_double(result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
#else
call pack_row_group_complex_gpu_single(result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
#endif
else
do i = 1, nblk
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef WITH_OPENMP
call pack_row_complex_cpu_openmp_double(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, &
stripe_width, stripe_count, max_threads, thread_width, l_nev)
#else
call pack_row_complex_cpu_double(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, &
last_stripe_width, stripe_count)
#endif
#else
#ifdef WITH_OPENMP
call pack_row_complex_cpu_openmp_single(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, &
stripe_width, stripe_count, max_threads, thread_width, l_nev)
#else
call pack_row_complex_cpu_single(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, &
last_stripe_width, stripe_count)
#endif
#endif
enddo
endif
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, dst, &
result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr)
#else
call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX8, dst, &
result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr)
#endif
#else /* WITH_MPI */
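! Without MPI there is only one process row; these copies appear to be
! placeholders mirroring the Isend above (the first is a self-assignment),
! keeping the buffer bookkeeping consistent without moving any data.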
if (j+num_result_buffers < num_result_blocks) &
result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf)
if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
do j1 = 1, min(num_result_buffers, num_result_blocks)
result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf)
enddo
endif
#endif /* WITH_MPI */
endif
enddo
else
! receive and store final result
do j = num_bufs_recvd, num_result_blocks-1
nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block
! If there is still work to do, just test for the next result request
! and leave the loop if it is not ready, otherwise wait for all
! outstanding requests
if (next_local_n > 0) then
#ifdef WITH_MPI
#ifdef WITH_OPENMP
call MPI_Test(result_recv_request(nbuf), flag, mpi_status, mpierr)
#else
call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr)
#endif
#else /* WITH_MPI */
flag = .true.
#endif /* WITH_MPI */
if (.not.flag) exit
else
#ifdef WITH_MPI
#ifdef WITH_OPENMP
call MPI_Wait(result_recv_request(nbuf), mpi_status, mpierr)
#else
call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr)
#endif
#else /* WITH_MPI */
#endif /* WITH_MPI */
endif
! Fill result buffer into q
num_blk = j*np_rows + my_prow ! global number of current block, 0 based
do i = 1, min(na - num_blk*nblk, nblk)
q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf)
enddo
! Queue result buffer again if there are outstanding blocks left
#ifdef WITH_MPI
if (j+num_result_buffers < num_result_blocks) &
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, &
mpi_comm_rows, result_recv_request(nbuf), mpierr)
#else
call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX8, 0, result_recv_tag, &
mpi_comm_rows, result_recv_request(nbuf), mpierr)
#endif
#else /* WITH_MPI */
! carefull "recieve" has to be done at corresponding wait or send
! if (j+num_result_buffers < num_result_blocks) &
! result_buffer(1:l_nev*nblk,1,nbuf) = result_buffer(1:l_nev*nblk,1,nbuf)
#endif /* WITH_MPI */
enddo
num_bufs_recvd = j
endif
#ifdef WITH_GPU_VERSION
! t2_result_time =MPI_Wtime()
! t0_result_time = t0_result_time + ( t2_result_time - t1_result_time)
#endif
! Shift the remaining rows to the front of A (if necessary)
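! Roughly speaking, each sweep the band moves on by nbw columns, of which
! top_msg_length have already been refilled from the row above, so the live
! window advances by offset = nbw - top_msg_length.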
offset = nbw - top_msg_length
if (offset<0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: internal error, offset for shifting = ',offset
endif
success = .false.
return
endif
a_off = a_off + offset
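! a_off is only a logical shift. The data is physically moved to the front
! of the stripes (and a_off reset to 0) only when the window
! a_off+1 .. a_off+next_local_n+nbw would no longer fit into a_dim2.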
if (a_off + next_local_n + nbw > a_dim2) then
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("OpenMP parallel_double")
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
!$omp parallel do private(my_thread, i, j), schedule(static, 1)
do my_thread = 1, max_threads
do i = 1, stripe_count
do j = top_msg_length+1, top_msg_length+next_local_n
a(:,j,i,my_thread) = a(:,j+a_off,i,my_thread)
enddo
enddo
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("OpenMP parallel_double")
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#else /*WITH_OPENMP */
do i = 1, stripe_count
if (useGPU) then
chunk = min(next_local_n - 1, a_off)
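! The device-to-device copies are chunked: with chunk <= a_off the source
! range (starting at column j+a_off) stays ahead of the destination range
! (starting at column j), which presumably keeps the two regions
! non-overlapping as cudaMemcpy requires.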
do j = top_msg_length + 1, top_msg_length + next_local_n, chunk
top = min(j + chunk, top_msg_length + next_local_n)
this_chunk = top - j + 1
#ifdef DOUBLE_PRECISION_COMPLEX
dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * &
size_of_double_complex_datatype
dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * &
size_of_double_complex_datatype
! it is wasteful to always pass the parameter "cudaMemcpyDeviceToDevice" here; this could be set ONCE at startup
! tmp = cuda_d2d(1)
successCUDA = cuda_memcpy( a_dev + dev_offset , a_dev+dev_offset_1, &
stripe_width*this_chunk*size_of_double_complex_datatype, cudaMemcpyDeviceToDevice)
#else
dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * &
size_of_single_complex_datatype
dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * &
size_of_single_complex_datatype
! it is wasteful to always pass the parameter "cudaMemcpyDeviceToDevice" here; this could be set ONCE at startup
! tmp = cuda_d2d(1)
successCUDA = cuda_memcpy( a_dev + dev_offset , a_dev+dev_offset_1, &
stripe_width*this_chunk*size_of_single_complex_datatype, cudaMemcpyDeviceToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
stop
endif
enddo
else ! useGPU
do j = top_msg_length+1, top_msg_length+next_local_n
a(:,j,i) = a(:,j+a_off,i)
enddo
endif
enddo
#endif /*WITH_OPENMP */
a_off = 0
endif
enddo
#ifdef WITH_GPU_VERSION
! t2_outer_do_time =MPI_Wtime()
! t0_outer_do_time = t0_outer_do_time + ( t2_outer_do_time - t1_outer_do_time)
!
! istat = cuda_ProfilerStop()
#endif
! Just for safety:
#ifdef WITH_MPI
if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol
if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol
if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol
if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol
#endif
if (my_prow == 0) then
#ifdef WITH_MPI
#ifdef WITH_OPENMP
allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating mpi_statuses "//errorMessage
stop
endif
call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr)
deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating mpi_statuses "//errorMessage
stop
endif
#else
call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr)
#endif
#endif /* WITH_MPI */
endif
#ifdef WITH_MPI
if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol
if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol
#endif
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6
! deallocate all working space
if (.not.(useGPU)) then
nullify(a)
call free(a_ptr)
! deallocate(a, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_tridi_to_band_complex: error deallocating a "//errorMessage
! stop
! endif
endif
deallocate(row, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating row "//errorMessage
stop
endif
deallocate(limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating limits "//errorMessage
stop
endif
deallocate(result_send_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating result_send_request "//errorMessage
stop
endif
deallocate(result_recv_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating result_recv_request "//errorMessage
stop
endif
deallocate(top_border_send_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating top_border_send_buffer "//errorMessage
stop
endif
deallocate(top_border_recv_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating top_border_recv_buffer "//errorMessage
stop
endif
deallocate(bottom_border_send_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating top_border_send_buffer "//errorMessage
stop
endif
deallocate(bottom_border_recv_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating bottom_border_recv_buffer "//errorMessage
stop
endif
deallocate(result_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating result_buffer "//errorMessage
stop
endif
deallocate(bcast_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating bcast_buffer "//errorMessage
stop
endif
deallocate(top_send_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating top_send_request "//errorMessage
stop
endif
deallocate(top_recv_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating top_recv_request "//errorMessage
stop
endif
deallocate(bottom_send_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating bottom_send_request "//errorMessage
stop
endif
deallocate(bottom_recv_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating bottom_recv_request "//errorMessage
stop
endif
if (useGPU) then
successCUDA = cuda_free(a_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
stop
endif
successCUDA = cuda_free(hh_tau_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
stop
endif
successCUDA = cuda_free(hh_dot_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
stop
endif
successCUDA = cuda_free(row_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
stop
endif
deallocate(row_group, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating row_group "//errorMessage
stop
endif
successCUDA= cuda_free(row_group_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
stop
endif
successCUDA = cuda_free(bcast_buffer_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
stop
endif
endif ! useGPU
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("trans_ev_tridi_to_band_complex_double")
#else
call timer%stop("trans_ev_tridi_to_band_complex_single")
#endif
#endif
return
contains
#ifdef DOUBLE_PRECISION_COMPLEX
! The host wrapper for extracting "tau" from the HH reflectors (see the
! kernel below)
subroutine extract_hh_tau_complex_gpu_double(nbw, n, is_zero)
#else
subroutine extract_hh_tau_complex_gpu_single(nbw, n, is_zero)
#endif
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n
logical, value :: is_zero
integer(kind=ik) :: val_is_zero
if (is_zero) then
val_is_zero = 1
else
val_is_zero = 0
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_extract_hh_tau_c_kernel_complex_double(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero)
#else
call launch_extract_hh_tau_c_kernel_complex_single(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero)
#endif
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine compute_hh_dot_products_complex_gpu_double(nbw, n)
#else
subroutine compute_hh_dot_products_complex_gpu_single(nbw, n)
#endif
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n
if (n .le. 1) return
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_compute_hh_dotp_c_kernel_complex_double( bcast_buffer_dev, hh_dot_dev, nbw,n)
#else
call launch_compute_hh_dotp_c_kernel_complex_single( bcast_buffer_dev, hh_dot_dev, nbw,n)
#endif
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine pack_row_group_complex_gpu_double(rows, n_offset, row_count)
#else
subroutine pack_row_group_complex_gpu_single(rows, n_offset, row_count)
#endif
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: n_offset, row_count
complex(kind=COMPLEX_DATATYPE) :: rows(:,:)
integer(kind=ik) :: max_idx
logical :: successCUDA
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
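! max_idx is the number of valid rows spread over the stripes: every stripe
! is stripe_width wide except the last, which only holds last_stripe_width.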
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_my_pack_c_kernel_complex_double(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, a_dev, row_group_dev)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_double_complex_datatype, &
cudaMemcpyDeviceToHost)
#else
call launch_my_pack_c_kernel_complex_single(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, a_dev, row_group_dev)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_single_complex_datatype, &
cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"pack_row_group_complex_gpu: error in cudaMemcpy"
stop
endif
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine unpack_row_group_complex_gpu_double(rows, n_offset, row_count)
#else
subroutine unpack_row_group_complex_gpu_single(rows, n_offset, row_count)
#endif
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: n_offset, row_count
complex(kind=COMPLEX_DATATYPE), intent(in) :: rows(:, :)
integer(kind=ik) :: max_idx
integer(kind=ik) :: i
logical :: successCUDA
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_double_complex_datatype , &
cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_single_complex_datatype , &
cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"unpack_row_group_complex_gpu: error in cudaMemcpy"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_my_unpack_c_kernel_complex_double( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,a_dev)
#else
call launch_my_unpack_c_kernel_complex_single( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,a_dev)
#endif
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine unpack_and_prepare_row_group_complex_gpu_double(next_unpack_idx, force)
#else
subroutine unpack_and_prepare_row_group_complex_gpu_single(next_unpack_idx, force)
#endif
use precision
implicit none
integer(kind=ik), intent(in) :: next_unpack_idx
logical, intent(in) :: force
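! Rows destined for the device are batched in row_group and flushed as one
! transfer when the group is full (nblk rows), when the next index is no
! longer consecutive, or when the caller forces a flush.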
if (row_group_size == 0) then
! Nothing to flush, just prepare for the upcoming row
row_group_size = 1
else
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /=next_unpack_idx)) then
! A flush and a reset must be performed
#ifdef DOUBLE_PRECISION_COMPLEX
call unpack_row_group_complex_gpu_double(row_group(:, :), unpack_idx - row_group_size, row_group_size)
#else
call unpack_row_group_complex_gpu_single(row_group(:, :), unpack_idx - row_group_size, row_group_size)
#endif
row_group_size = 1
else
! Just prepare for the upcoming row
row_group_size = row_group_size + 1
endif
endif
! Always update the index for the upcoming row
unpack_idx = next_unpack_idx
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine compute_hh_trafo_complex_gpu_double(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
#else
subroutine compute_hh_trafo_complex_gpu_single(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
#endif
use iso_c_binding
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: off, ncols, istripe
integer(kind=ik) :: nl
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
integer(kind=ik) :: a_off
integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
if (ncols < 1) return
ttt = mpi_wtime()
nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
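! nl is the effective stripe width (the last stripe may be narrower). The
! three byte offsets below locate, in order: the submatrix of a_dev to be
! transformed, the first Householder vector in bcast_buffer_dev, and the
! corresponding tau in hh_tau_dev.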
#ifdef DOUBLE_PRECISION_COMPLEX
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_double_complex_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_double_complex_datatype
dev_offset_2 =( off-1 )*size_of_double_complex_datatype
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_double(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#else
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_single_complex_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_single_complex_datatype
dev_offset_2 =( off-1 )*size_of_single_complex_datatype
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_single(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#endif
! time0 = time0 + time1
! t2_compute_kernel =MPI_Wtime()
! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel
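! Accumulate the flop estimate (4 operations per matrix element and
! Householder vector) that feeds the MFlops figure reported at the end.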
kernel_flops = kernel_flops + 4 * int(nl, 8) * int(ncols, 8) * int(nbw,8)
kernel_time = kernel_time + mpi_wtime() - ttt
n_times =n_times +1
end subroutine
end subroutine