#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
    subroutine bandred_complex_double(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, &
                                      useGPU, success)
#else
    subroutine bandred_complex_single(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, &
                                      useGPU, success)
#endif

    !-------------------------------------------------------------------------------
    !  bandred_complex: Reduces a distributed hermitian matrix to band form
    !
    !  Parameters
    !
    !  na          Order of matrix
    !
    !  a(lda,matrixCols)    Distributed matrix which should be reduced.
    !              Distribution is like in Scalapack.
    !              As opposed to Scalapack, a(:,:) must be set completely (upper and lower half).
    !              a(:,:) is overwritten on exit with the band and the Householder vectors
    !              in the upper half.
    !
    !  lda         Leading dimension of a
    !  matrixCols  local columns of matrix a
    !
    !  nblk        blocksize of cyclic distribution, must be the same in both directions!
    !
    !  nbw         semi bandwidth of output matrix
    !
    !  mpi_comm_rows
    !  mpi_comm_cols
    !              MPI-Communicators for rows/columns
    !
    !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
    !              Factors for the Householder vectors (returned), needed for back transformation
    !
    !-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      use precision
      use cuda_functions
      use iso_c_binding
      implicit none

      logical, intent(in)                         :: useGPU
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
      complex(kind=COMPLEX_DATATYPE)              :: a(lda,*), tmat(nbw,nbw,*)
#else
      complex(kind=COMPLEX_DATATYPE)              :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
      complex(kind=COMPLEX_DATATYPE), parameter   :: CZERO = (0.0_rk8, 0.0_rk8), CONE = (1.0_rk8, 0.0_rk8)
#else
      complex(kind=COMPLEX_DATATYPE), parameter   :: CZERO = (0.0_rk4, 0.0_rk4), CONE = (1.0_rk4, 0.0_rk4)
#endif
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
      integer(kind=ik)                            :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
      real(kind=REAL_DATATYPE)                    :: vnorm2
      complex(kind=COMPLEX_DATATYPE)              :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
      complex(kind=COMPLEX_DATATYPE), allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)
      integer(kind=c_intptr_t)                    :: umc_dev, tmat_dev, vav_dev, vmr_dev, a_dev
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
      integer(kind=c_size_t)                      :: lc_start, lc_end, lr_end, lce_1, lcs_1, lre_1
      integer(kind=ik)                            :: na_rows, na_cols
#ifdef WITH_MPI
      integer(kind=ik), external                  :: numroc
#endif
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      character(200)                              :: errorMessage
      integer(kind=ik)                            :: istat
      logical                                     :: successCUDA

#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
      call timer%start("bandred_complex_double")
#else
      call timer%start("bandred_complex_single")
#endif
#endif
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      success = .true.

      ! Semibandwidth nbw must be a multiple of blocksize nblk

      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
            write(error_unit,*) 'ELPA2_bandred_complex: ERROR: nbw=',nbw,', nblk=',nblk
            write(error_unit,*) 'ELPA2_bandred_complex: ELPA2 works only for nbw==n*nblk'
          endif
          success = .false.
          return
        endif
      endif

      if (useGPU) then
#ifdef WITH_MPI
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
        na_rows = na
        na_cols = na
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_complex_datatype)
#else
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_complex_datatype)
#endif
        if (.not.(successCUDA)) then
          print *, "bandred_complex: cuda malloc failed tmat_dev ", istat
          stop
        endif

#ifdef DOUBLE_PRECISION_COMPLEX
        successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_double_complex_datatype)
#else
        successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_single_complex_datatype)
#endif
        if (.not.(successCUDA)) then
          print *, "bandred_complex: cuda malloc failed vav_dev ", istat
          stop
        endif

#ifdef DOUBLE_PRECISION_COMPLEX
        successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_double_complex_datatype)
#else
        successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_single_complex_datatype)
#endif
        if (.not.(successCUDA)) then
          print *, "bandred_complex: cuda malloc failed a_dev ", istat
          stop
        endif
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

      if (useGPU) then
        if (size(a,dim=1) .ne. lda .or. size(a,dim=2) .ne. na_cols) then
          print *,"bandred_complex: sizes of a are wrong: ",lda,size(a,dim=1),na_cols,size(a,dim=2)
        endif
#ifdef DOUBLE_PRECISION_COMPLEX
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)),(lda)*(na_cols)*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)),(lda)*(na_cols)*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
        if (.not.(successCUDA)) then
          print *, "bandred_complex: cuda memcpy failed a_dev ", istat
          stop
        endif
      endif

      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          if ((.not. allocated(umc)) .or. (umc_size .gt. ubound(umc, dim=1))) then
            if (allocated(umc)) then
              deallocate(umc, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
                print *,"bandred_complex: error when deallocating umc "//errorMessage
                stop
              endif
              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
                print *,"bandred_complex: error in cudaFree"
                stop
              endif
            endif

            allocate(umc(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_complex: error when allocating umc "//errorMessage
              stop
            endif

            if (max(l_cols,1) * 2*n_cols .gt. umc_size) then
              print *,"bandred_complex: umc_size ",max(l_cols,1) * 2*n_cols,umc_size
            endif

#ifdef DOUBLE_PRECISION_COMPLEX
            successCUDA = cuda_malloc(umc_dev, umc_size*size_of_double_complex_datatype)
#else
            successCUDA = cuda_malloc(umc_dev, umc_size*size_of_single_complex_datatype)
#endif
            if (.not.(successCUDA)) then
              print *, "bandred_complex: cuda malloc failed umc_dev ", istat
              stop
            endif
          endif
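          ! Layout convention for the panel work arrays (applies to the host arrays
          ! and their device mirrors vmr_dev/umc_dev alike):
          !   vmr(:,1:n_cols)          - Householder vectors V of the current panel
          !   vmr(:,n_cols+1:2*n_cols) - U transposed back to the row distribution
          !   umc(:,1:n_cols)          - U = A**H * V, accumulated tile by tile
          !   umc(:,n_cols+1:2*n_cols) - V transposed to the column distribution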
          if ((.not. allocated(vmr)) .or. (vmr_size .gt. ubound(vmr, dim=1))) then
            if (allocated(vmr)) then
              deallocate(vmr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
                print *,"bandred_complex: error when deallocating vmr "//errorMessage
                stop
              endif
              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
                print *,"bandred_complex: error in cudaFree"
                stop
              endif
            endif

            allocate(vmr(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_complex: error when allocating vmr "//errorMessage
              stop
            endif

            if (max(l_rows,1) * 2*n_cols .gt. vmr_size) then
              print *,"bandred_complex: vmr_size ",max(l_rows,1) * 2*n_cols,vmr_size
            endif

#ifdef DOUBLE_PRECISION_COMPLEX
            successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_double_complex_datatype)
#else
            successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_single_complex_datatype)
#endif
            if (.not.(successCUDA)) then
              print *, "bandred_complex: cuda malloc failed vmr_dev ", istat
              stop
            endif
          endif

          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
                print *,"bandred_complex: error when deallocating vr "//errorMessage
                stop
              endif
            endif

            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_complex: error when allocating vr "//errorMessage
              stop
            endif
          endif

        else ! GPU not used
          allocate(vmr(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_complex: error when allocating vmr "//errorMessage
            stop
          endif

          allocate(umc(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_complex: error when allocating umc "//errorMessage
            stop
          endif

          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_complex: error when allocating vr "//errorMessage
            stop
          endif
        endif ! useGPU

#ifdef DOUBLE_PRECISION_COMPLEX
        vmr(1:l_rows,1:n_cols) = 0._ck8
        vr(:) = 0._ck8
        tmat(:,:,istep) = 0._ck8
#else
        vmr(1:l_rows,1:n_cols) = 0._ck4
        vr(:) = 0._ck4
        tmat(:,:,istep) = 0._ck4
#endif
        if (useGPU) then
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

          if (lc_start .le. 0) lc_start = 1
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
          if (my_pcol == cur_pcol) then
#ifdef DOUBLE_PRECISION_COMPLEX
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), int(lda*size_of_double_complex_datatype,kind=c_size_t), &
                                        (a_dev + int( ( (lc_start-1) * lda*size_of_double_complex_datatype),kind=c_size_t )), &
                                        int(lda*size_of_double_complex_datatype,kind=c_size_t), &
                                        int(lr_end*size_of_double_complex_datatype,kind=c_size_t), &
                                        int((lc_end - lc_start+1),kind=c_size_t),int(cudaMemcpyDeviceToHost,kind=c_int))
#else
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), int(lda*size_of_single_complex_datatype,kind=c_size_t), &
                                        (a_dev + int( ( (lc_start-1) * lda*size_of_single_complex_datatype),kind=c_size_t )), &
                                        int(lda*size_of_single_complex_datatype,kind=c_size_t), &
                                        int(lr_end*size_of_single_complex_datatype,kind=c_size_t), &
                                        int((lc_end - lc_start+1),kind=c_size_t),int(cudaMemcpyDeviceToHost,kind=c_int))
#endif
            if (.not.(successCUDA)) then
              print *, "bandred_complex: error in cudaMemcpy2d"
              stop
            endif
          endif
        endif

        ! Reduce current block to lower triangular form

        do lc = n_cols, 1, -1

          ncol = istep*nbw + lc ! absolute column number of householder vector
          nrow = ncol - nbw !
Absolute number of pivot row lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number tau = 0 if(nrow == 1) exit ! Nothing to do cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block if (my_pcol==cur_pcol) then ! Get vector to be transformed; distribute last element and norm of ! remaining elements to all procs in current column vr(1:lr) = a(1:lr,lch) ! vector to be transformed if (my_prow==prow(nrow, nblk, np_rows)) then aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) aux1(2) = vr(lr) else aux1(1) = dot_product(vr(1:lr),vr(1:lr)) #ifdef DOUBLE_PRECISION_COMPLEX aux1(2) = 0._ck8 #else aux1(2) = 0._ck4 #endif endif #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call mpi_allreduce(aux1, aux2, 2, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) #else call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) #endif #else /* WITH_MPI */ aux2 = aux1 #endif /* WITH_MPI */ vnorm2 = aux2(1) vrl = aux2(2) ! Householder transformation #ifdef DOUBLE_PRECISION_COMPLEX call hh_transform_complex_double(vrl, vnorm2, xf, tau) #else call hh_transform_complex_single(vrl, vnorm2, xf, tau) #endif ! Scale vr and store Householder vector for back transformation vr(1:lr) = vr(1:lr) * xf if (my_prow==prow(nrow, nblk, np_rows)) then a(1:lr-1,lch) = vr(1:lr-1) a(lr,lch) = vrl #ifdef DOUBLE_PRECISION_COMPLEX vr(lr) = 1._ck8 #else vr(lr) = 1._ck4 #endif else a(1:lr,lch) = vr(1:lr) endif endif ! Broadcast Householder vector and tau along columns vr(lr+1) = tau #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Bcast(vr, lr+1, MPI_DOUBLE_COMPLEX, cur_pcol, mpi_comm_cols, mpierr) #else call MPI_Bcast(vr, lr+1, MPI_COMPLEX, cur_pcol, mpi_comm_cols, mpierr) #endif #endif /* WITH_MPI */ vmr(1:lr,lc) = vr(1:lr) tau = vr(lr+1) tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat ! Transform remaining columns in current block with Householder vector ! Local dot product #ifdef DOUBLE_PRECISION_COMPLEX aux1 = 0._ck8 #else aux1 = 0._ck4 #endif nlc = 0 ! number of local columns do j=1,lc-1 lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) if (lcx>0) then nlc = nlc+1 aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx)) endif enddo ! Get global dot products #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) #else if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) #endif #else /* WITH_MPI */ if (nlc>0) aux2=aux1 #endif /* WITH_MPI */ ! Transform nlc = 0 do j=1,lc-1 lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) if (lcx>0) then nlc = nlc+1 a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) endif enddo enddo ! Calculate scalar products of stored Householder vectors. ! 
        ! This can be done in different ways, we use zherk

        if (useGPU) then
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
          if (my_pcol == cur_pcol) then
#ifdef DOUBLE_PRECISION_COMPLEX
            successCUDA = cuda_memcpy2d((a_dev+int(((lc_start-1)*lda*size_of_double_complex_datatype),kind=c_size_t)), &
                                        int(lda*size_of_double_complex_datatype,kind=c_size_t), loc(a(1,lc_start)), &
                                        int(lda*size_of_double_complex_datatype,kind=c_size_t), &
                                        int(lr_end*size_of_double_complex_datatype,kind=c_size_t), &
                                        int((lc_end - lc_start+1),kind=c_size_t) &
                                        ,int(cudaMemcpyHostToDevice,kind=c_int))
#else
            successCUDA = cuda_memcpy2d((a_dev+int(((lc_start-1)*lda*size_of_single_complex_datatype),kind=c_size_t)), &
                                        int(lda*size_of_single_complex_datatype,kind=c_size_t), loc(a(1,lc_start)), &
                                        int(lda*size_of_single_complex_datatype,kind=c_size_t), &
                                        int(lr_end*size_of_single_complex_datatype,kind=c_size_t), &
                                        int((lc_end - lc_start+1),kind=c_size_t) &
                                        ,int(cudaMemcpyHostToDevice,kind=c_int))
#endif
            if (.not.(successCUDA)) then
              print *, "bandred_complex: cuda memcpy a_dev failed ", istat
              stop
            endif
          endif
        endif

        vav = 0
        if (l_rows>0) &
#ifdef DOUBLE_PRECISION_COMPLEX
          call zherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1))
        call herm_matrix_allreduce_double(n_cols,vav, nbw,nbw,mpi_comm_rows)
#else
          call cherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1))
        call herm_matrix_allreduce_single(n_cols,vav, nbw,nbw,mpi_comm_rows)
#endif

        ! Calculate triangular matrix T for block Householder Transformation

        do lc=n_cols,1,-1
          tau = tmat(lc,lc,istep)
          if (lc<n_cols) then
#ifdef DOUBLE_PRECISION_COMPLEX
            call ztrmv('U', 'C', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
#else
            call ctrmv('U', 'C', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
#endif
            tmat(lc,lc+1:n_cols,istep) = -conjg(tau)*vav(lc+1:n_cols,lc)
          endif
        enddo

        ! Transpose vmr -> vmc (stored in umc, second half)

#ifdef DOUBLE_PRECISION_COMPLEX
        call elpa_transpose_vectors_complex_double  (vmr, ubound(vmr,dim=1), mpi_comm_rows, &
                                                     umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, &
                                                     1, istep*nbw, n_cols, nblk)
#else
        call elpa_transpose_vectors_complex_single  (vmr, ubound(vmr,dim=1), mpi_comm_rows, &
                                                     umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, &
                                                     1, istep*nbw, n_cols, nblk)
#endif

        ! Calculate umc = A**T * vmr
        ! Note that the distributed A has to be transposed
        ! Opposed to direct tridiagonalization there is no need to use the cache locality
        ! of the tiles, so we can use strips of the matrix

#ifdef DOUBLE_PRECISION_COMPLEX
        umc(1:l_cols,1:n_cols) = 0.0_ck8
        vmr(1:l_rows,n_cols+1:2*n_cols) = 0._ck8
#else
        umc(1:l_cols,1:n_cols) = 0.0_ck4
        vmr(1:l_rows,n_cols+1:2*n_cols) = 0._ck4
#endif
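        ! In essence, the tiled loop below accumulates, strip by strip,
        !   umc(lcs:lce,1:n_cols)    <- umc(lcs:lce,1:n_cols)    + A(:,lcs:lce)**H * V
        !   vmr(:,n_cols+1:2*n_cols) <- vmr(:,n_cols+1:2*n_cols) + A(:,lcs:lce)    * V_c(lcs:lce,:)
        ! touching only tiles on or above the diagonal of the remaining matrix.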
        if (l_cols>0 .and. l_rows>0) then
          if (useGPU) then
            if (size(vmr,dim=1)*size(vmr,dim=2) .gt. vmr_size) then
              print *,"bandred_complex: vmr size 2 :",size(vmr,dim=1)*size(vmr,dim=2),vmr_size
              stop
            endif
#ifdef DOUBLE_PRECISION_COMPLEX
            successCUDA = cuda_memcpy(vmr_dev, loc(vmr(1,1)),vmr_size*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
            successCUDA = cuda_memcpy(vmr_dev, loc(vmr(1,1)),vmr_size*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
            if (.not.(successCUDA)) then
              print *, "bandred_complex: cuda memcpy vmr_dev failed ", istat
              stop
            endif

            if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
              print *,"bandred_complex: umc size 2 :",size(umc,dim=1)*size(umc,dim=2),umc_size
              stop
            endif
#ifdef DOUBLE_PRECISION_COMPLEX
            successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)),umc_size*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
            successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)),umc_size*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
            if (.not.(successCUDA)) then
              print *, "bandred_complex: cuda memcpy umc_dev failed ", istat
              stop
            endif
          endif

          do i=0,(istep*nbw-1)/tile_size
            lcs = i*l_cols_tile+1
            lce = min(l_cols,(i+1)*l_cols_tile)
            if (lce0) then
            allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_complex: error when allocating tmp "//errorMessage
              stop
            endif
#ifdef DOUBLE_PRECISION_COMPLEX
            call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
            call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
            umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
            deallocate(tmp, stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_complex: error when deallocating tmp "//errorMessage
              stop
            endif
          endif
#else /* WITH_MPI */
          if (l_cols>0) then
            allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_complex: error when allocating tmp "//errorMessage
              stop
            endif
            tmp(1:l_cols,1:n_cols) = umc(1:l_cols,1:n_cols)
            umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
            deallocate(tmp, stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_complex: error when deallocating tmp "//errorMessage
              stop
            endif
          endif
#endif /* WITH_MPI */

        ! U = U * Tmat**T

        if (useGPU) then
          if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
            print *,"bandred_complex: umc size 4 :",size(umc,dim=1)*size(umc,dim=2),umc_size
            stop
          endif
#ifdef DOUBLE_PRECISION_COMPLEX
          successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)),umc_size*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
          successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)),umc_size*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
          if (.not.(successCUDA)) then
            print *, "bandred_complex: cuda memcpy failed umc_dev ", istat
            stop
          endif
#ifdef DOUBLE_PRECISION_COMPLEX
          successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
          successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
          if (.not.(successCUDA)) then
            print *, "bandred_complex: cuda memcpy failed tmat_dev ", istat
            stop
          endif
#ifdef DOUBLE_PRECISION_COMPLEX
          call cublas_ztrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat_dev, nbw, umc_dev, cur_l_cols)
#else
          call cublas_ctrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat_dev, nbw, umc_dev, cur_l_cols)
#endif
        else ! not useGPU
#ifdef DOUBLE_PRECISION_COMPLEX
          call ztrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), &
                     umc, ubound(umc,dim=1))
#else
          call ctrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), &
                     umc, ubound(umc,dim=1))
#endif
        endif
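        ! Why VAV is formed next: with U = A*V and U <- U*Tmat**T as computed above, the
        ! two-sided block Householder update of the trailing matrix can be written as the
        ! Hermitian rank-2k update A <- A - V*U**T - U*V**T, provided U is first corrected
        ! by -0.5*V*VAV, where VAV = Tmat * V**T * A * V * Tmat**T.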
        ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T

        if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
          successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
          successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
          if (.not.(successCUDA)) then
            print *, "bandred_complex: cuda memcpy failed vav_dev ", istat
            stop
          endif
#ifdef DOUBLE_PRECISION_COMPLEX
          call cublas_zgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc_dev, cur_l_cols, (umc_dev +( cur_l_cols *n_cols) &
                            *size_of_double_complex_datatype ), cur_l_cols, CZERO, vav_dev, nbw)
          call cublas_ztrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat_dev, nbw, vav_dev, nbw)
#else
          call cublas_cgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc_dev, cur_l_cols, (umc_dev +( cur_l_cols *n_cols) &
                            *size_of_single_complex_datatype ), cur_l_cols, CZERO, vav_dev, nbw)
          call cublas_ctrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat_dev, nbw, vav_dev, nbw)
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
          successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev,nbw*nbw*size_of_double_complex_datatype,cudaMemcpyDeviceToHost)
#else
          successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev,nbw*nbw*size_of_single_complex_datatype,cudaMemcpyDeviceToHost)
#endif
          if (.not.(successCUDA)) then
            print *, "bandred_complex: cuda memcpy failed vav ", istat
            stop
          endif

#ifdef DOUBLE_PRECISION_COMPLEX
          call herm_matrix_allreduce_double(n_cols,vav, nbw, nbw,mpi_comm_cols)
#else
          call herm_matrix_allreduce_single(n_cols,vav, nbw, nbw,mpi_comm_cols)
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
          successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
          successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
          if (.not.(successCUDA)) then
            print *, "bandred_complex: cuda memcpy failed vav_dev ", istat
            stop
          endif
        else
#ifdef DOUBLE_PRECISION_COMPLEX
          call zgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc, ubound(umc,dim=1), umc(1,n_cols+1), &
                     ubound(umc,dim=1), CZERO, vav, ubound(vav,dim=1))
          call ztrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat(1,1,istep), &
                     ubound(tmat,dim=1), vav, ubound(vav,dim=1))
#else
          call cgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc, ubound(umc,dim=1), umc(1,n_cols+1), &
                     ubound(umc,dim=1), CZERO, vav, ubound(vav,dim=1))
          call ctrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat(1,1,istep), &
                     ubound(tmat,dim=1), vav, ubound(vav,dim=1))
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
          call herm_matrix_allreduce_double(n_cols,vav,nbw,nbw,mpi_comm_cols)
#else
          call herm_matrix_allreduce_single(n_cols,vav,nbw,nbw,mpi_comm_cols)
#endif
        endif

        ! U = U - 0.5 * V * VAV

        if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
          call cublas_zgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk8, 0.0_rk8), (umc_dev + &
                            (cur_l_cols * n_cols )*size_of_double_complex_datatype), &
                            cur_l_cols, vav_dev, nbw, CONE, umc_dev, cur_l_cols)
#else
          call cublas_cgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk4, 0.0_rk4), (umc_dev + &
                            (cur_l_cols * n_cols )*size_of_single_complex_datatype), &
                            cur_l_cols, vav_dev, nbw, CONE, umc_dev, cur_l_cols)
#endif
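          ! Before the trailing-matrix update, the corrected U (stored in umc) has to be
          ! redistributed: every process needs the rows of U matching its local share of V,
          ! hence the transpose from the column distribution back to the row distribution.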
          ! Transpose umc -> umr (stored in vmr, second half)

          if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
            print *,"bandred_complex: umc size 5 :",size(umc,dim=1)*size(umc,dim=2),umc_size
            stop
          endif
#ifdef DOUBLE_PRECISION_COMPLEX
          successCUDA = cuda_memcpy(loc(umc(1,1)),umc_dev,umc_size*size_of_double_complex_datatype,cudaMemcpyDeviceToHost)
#else
          successCUDA = cuda_memcpy(loc(umc(1,1)),umc_dev,umc_size*size_of_single_complex_datatype,cudaMemcpyDeviceToHost)
#endif
          if (.not.(successCUDA)) then
            print *, "bandred_complex: cuda memcpy failed umc ", istat
            stop
          endif

#ifdef DOUBLE_PRECISION_COMPLEX
          call elpa_transpose_vectors_complex_double  (umc, ubound(umc,dim=1), mpi_comm_cols, &
                                                       vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
                                                       1, istep*nbw, n_cols, nblk)
#else
          call elpa_transpose_vectors_complex_single  (umc, ubound(umc,dim=1), mpi_comm_cols, &
                                                       vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
                                                       1, istep*nbw, n_cols, nblk)
#endif

          if (size(vmr,dim=1)*size(vmr,dim=2) .gt. vmr_size) then
            print *,"bandred_complex: vmr size 4 :",size(vmr,dim=1)*size(vmr,dim=2),vmr_size
            stop
          endif
#ifdef DOUBLE_PRECISION_COMPLEX
          successCUDA = cuda_memcpy(vmr_dev,loc(vmr(1,1)),vmr_size*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
          successCUDA = cuda_memcpy(vmr_dev,loc(vmr(1,1)),vmr_size*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
          if (.not.(successCUDA)) then
            print *, "bandred_complex: cuda memcpy failed vmr_dev ", istat
            stop
          endif

          if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
            print *,"bandred_complex: umc size 6 :",size(umc,dim=1)*size(umc,dim=2),umc_size
            stop
          endif
#ifdef DOUBLE_PRECISION_COMPLEX
          successCUDA = cuda_memcpy(umc_dev,loc(umc(1,1)),umc_size*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
          successCUDA = cuda_memcpy(umc_dev,loc(umc(1,1)),umc_size*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
          if (.not.(successCUDA)) then
            print *, "bandred_complex: cuda memcpy failed umc_dev ", istat
            stop
          endif
        else ! not useGPU
#ifdef DOUBLE_PRECISION_COMPLEX
          call zgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk8, 0.0_rk8), umc(1,n_cols+1), ubound(umc,dim=1), &
                     vav, ubound(vav,dim=1), CONE, umc, ubound(umc,dim=1))
#else
          call cgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk4, 0.0_rk4), umc(1,n_cols+1), ubound(umc,dim=1), &
                     vav, ubound(vav,dim=1), CONE, umc, ubound(umc,dim=1))
#endif

          ! Transpose umc -> umr (stored in vmr, second half)

#ifdef DOUBLE_PRECISION_COMPLEX
          call elpa_transpose_vectors_complex_double  (umc, ubound(umc,dim=1), mpi_comm_cols, &
                                                       vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
                                                       1, istep*nbw, n_cols, nblk)
#else
          call elpa_transpose_vectors_complex_single  (umc, ubound(umc,dim=1), mpi_comm_cols, &
                                                       vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
                                                       1, istep*nbw, n_cols, nblk)
#endif
        endif

        !
A = A - V*U**T - U*V**T do i=0,(istep*nbw-1)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) lre = min(l_rows,(i+1)*l_rows_tile) if (lce 0) then if (useGPU) then #ifdef DOUBLE_PRECISION_COMPLEX call cublas_zgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm_dev, max_local_rows, & q_dev, ldq, CZERO, tmp_dev, n_cols) successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, n_cols*l_cols*size_of_double_complex_datatype, & cudaMemcpyDeviceToHost) #else call cublas_cgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm_dev, max_local_rows, & q_dev, ldq, CZERO, tmp_dev, n_cols) successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, n_cols*l_cols*size_of_single_complex_datatype, & cudaMemcpyDeviceToHost) #endif if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_complex: error in cudaMemcpy" stop endif else #ifdef DOUBLE_PRECISION_COMPLEX call zgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), & q, ldq, CZERO, tmp1, n_cols) #else call cgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), & q, ldq, CZERO, tmp1, n_cols) #endif endif else ! l_rows > 0 if (useGPU) then if (l_cols*n_cols .gt. (max_local_cols)*(nbw)) then print *,"trans_ev_band_to_full_complex: tmp_dev ",l_cols*n_cols,max_local_cols stop endif ! istat = cuda_memset(tmp_dev, 0, l_cols*n_cols*size_of_complex_datatype) ! if (istat .ne. 0) then ! print *,"trans_ev_band_to_full_complex: error in cudaMemset" ! stop ! endif endif #ifdef DOUBLE_PRECISION_COMPLEX tmp1(1:l_cols*n_cols) = 0._ck8 #else tmp1(1:l_cols*n_cols) = 0._ck4 #endif endif #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) #else call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) #endif #else /* WITH_MPI */ tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols) #endif /* WITH_MPI */ if (l_rows>0) then if (useGPU) then #ifdef DOUBLE_PRECISION_COMPLEX successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_double_complex_datatype,cudaMemcpyHostToDevice) #else successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_single_complex_datatype,cudaMemcpyHostToDevice) #endif if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_complex: error in cudaMemcpy" stop endif ! tmat_temp(1:nbw,1:nbw) = tmat(1:nbw,1:nbw,istep) #ifdef DOUBLE_PRECISION_COMPLEX successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)),nbw*nbw* & size_of_double_complex_datatype,cudaMemcpyHostToDevice) #else successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)),nbw*nbw* & size_of_single_complex_datatype,cudaMemcpyHostToDevice) #endif if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_complex: error in cudaMemcpy" stop endif #ifdef DOUBLE_PRECISION_COMPLEX call cublas_ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat_dev, nbw, tmp_dev, n_cols) call cublas_zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm_dev, max_local_rows, & tmp_dev, n_cols, CONE, q_dev, ldq) #else call cublas_ctrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat_dev, nbw, tmp_dev, n_cols) call cublas_cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm_dev, max_local_rows, & tmp_dev, n_cols, CONE, q_dev, ldq) #endif else ! 
not useGPU #ifdef DOUBLE_PRECISION_COMPLEX call ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols) call zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), & tmp2, n_cols, CONE, q, ldq) #else call ctrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols) call cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), & tmp2, n_cols, CONE, q, ldq) #endif endif endif !#ifdef WITH_GPU_VERSION ! istat =cuda_memcpy(loc(hvm(1,1)),hvm_dev,((max_local_rows)*nbw*size_of_complex_datatype),cudaMemcpyDeviceToHost) ! if (istat .ne. 0) then ! print *,"trans_ev_band_to_full_complex: error in cudaMemcpy" ! stop ! endif !#endif enddo deallocate(tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_band_to_full_complex: error when deallocating tmp1, tmp2, hvb, hvm "//errorMessage stop endif if (useGPU) then successCUDA = cuda_free(hvm_dev) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_complex: error in cudaFree" stop endif successCUDA = cuda_free(tmp_dev) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_complex: error in cudaFree" stop endif successCUDA = cuda_free(tmat_dev) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_complex: error in cudaFree" stop endif #ifdef DOUBLE_PRECISION_COMPLEX successCUDA = cuda_memcpy(loc(q), q_dev,ldq*matrixCols*size_of_double_complex_datatype, cudaMemcpyDeviceToHost) #else successCUDA = cuda_memcpy(loc(q), q_dev,ldq*matrixCols*size_of_single_complex_datatype, cudaMemcpyDeviceToHost) #endif if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_complex: error in cudaMemcpy" stop endif ! q(1:ldq,1:na_cols) = q_temp(1:ldq,1:na_cols) successCUDA = cuda_free(q_dev) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_complex: error in cudaFree" stop endif ! deallocate(q_temp, stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"trans_ev_band_to_full_complex: error when deallocating q_temp "//errorMessage ! endif !deallocate(tmat_temp, stat=istat, errmsg=errorMessage) !if (istat .ne. 0) then !print *,"trans_ev_band_to_full_complex: error when deallocating tmat_temp "//errorMessage !endif endif ! use GPU #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("trans_ev_band_to_full_complex_double") #else call timer%stop("trans_ev_band_to_full_complex_single") #endif #endif #ifdef DOUBLE_PRECISION_COMPLEX end subroutine trans_ev_band_to_full_complex_double #else end subroutine trans_ev_band_to_full_complex_single #endif #ifdef DOUBLE_PRECISION_COMPLEX subroutine tridiag_band_complex_double(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_complex, & mpi_comm_rows, mpi_comm_cols, mpi_comm) #else subroutine tridiag_band_complex_single(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_complex, & mpi_comm_rows, mpi_comm_cols, mpi_comm) #endif !------------------------------------------------------------------------------- ! tridiag_band_complex: ! Reduces a complex hermitian symmetric band matrix to tridiagonal form ! ! na Order of matrix a ! ! nb Semi bandwith ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal ! ! lda Leading dimension of a ! matrixCols local columns of matrix a ! ! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output) ! ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output) ! ! 
mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! mpi_comm ! MPI-Communicator for the total processor set !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none !#ifdef WITH_GPU_VERSION ! integer(C_SIZE_T) :: h_dev, hv_new_dev ,ab_dev,x_dev,hs_dev,tau_new_dev,hv_dev,hd_dev ! complex*16, allocatable :: ab_temp(:,:) !#endif integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm #ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=COMPLEX_DATATYPE),intent(in) :: a(lda,*) #else complex(kind=COMPLEX_DATATYPE), intent(in) :: a(lda,matrixCols) #endif real(kind=REAL_DATATYPE), intent(out) :: d(na), e(na) ! set only on PE 0 complex(kind=COMPLEX_DATATYPE), intent(inout), & allocatable :: hh_trans_complex(:,:) real(kind=REAL_DATATYPE) :: vnorm2 complex(kind=COMPLEX_DATATYPE) :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf complex(kind=COMPLEX_DATATYPE) :: hd(nb), hs(nb) integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt integer(kind=ik) :: my_pe, n_pes, mpierr integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols integer(kind=ik) :: ireq_ab, ireq_hv integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off #ifdef WITH_OPENMP integer(kind=ik), allocatable :: mpi_statuses(:,:) integer(kind=ik), allocatable :: omp_block_limits(:) integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter integer(kind=ik) :: omp_get_max_threads #ifdef WITH_MPI integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) #endif complex(kind=COMPLEX_DATATYPE), allocatable :: hv_t(:,:), tau_t(:) #endif integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:) integer(kind=ik), allocatable :: limits(:), snd_limits(:,:) integer(kind=ik), allocatable :: block_limits(:) complex(kind=COMPLEX_DATATYPE), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:) integer(kind=ik) :: istat character(200) :: errorMessage #ifndef WITH_MPI integer(kind=ik) :: startAddr #endif ! ! dummies for calling redist_band ! real*8 :: r_a(1,1), r_ab(1,1) #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%start("tridiag_band_complex_double") #else call timer%start("tridiag_band_complex_single") #endif #endif call mpi_comm_rank(mpi_comm,my_pe,mpierr) call mpi_comm_size(mpi_comm,n_pes,mpierr) call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) !#ifdef WITH_GPU_VERSION ! t_1 = 0 ! t_2 = 0 !#endif ! Get global_id mapping 2D procssor coordinates to global id allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_complex: error when allocating global_id "//errorMessage stop endif global_id(:,:) = 0 global_id(my_prow, my_pcol) = my_pe #ifdef WITH_MPI call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) #endif ! Total number of blocks in the band: nblocks_total = (na-1)/nb + 1 ! Set work distribution allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_complex: error when allocating block_limits "//errorMessage stop endif call divide_band(nblocks_total, n_pes, block_limits) ! 
nblocks: the number of blocks for my task nblocks = block_limits(my_pe+1) - block_limits(my_pe) ! allocate the part of the band matrix which is needed by this PE ! The size is 1 block larger than needed to avoid extensive shifts allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_complex: error when allocating ab "//errorMessage stop endif !#ifdef WITH_GPU_VERSION ! allocate(ab_temp(2*nb,nblocks*nb), stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"error when allocating ab_temp "//errorMessage ! stop ! endif !#endif ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety !#ifdef WITH_GPU_VERSION ! ! istat = cuda_malloc(ab_dev, 2*nb*(nblocks+1)*nb*size_of_complex_datatype) ! if (istat .ne. 0) print *, " cuda malloc failed ab_dev", istat ! ! istat = cuda_malloc(hv_new_dev, nb*size_of_complex_datatype ) ! if (istat .ne. 0) print *, " cuda malloc failed hv_new_dev", istat ! !! istat = cuda_malloc(temp_c_dev, nb*nb*size_of_complex_datatype ) !! if(istat .ne. 0) print *, " cuda malloc failed temp_c", istat ! ! istat = cuda_malloc(h_dev , nb*size_of_complex_datatype) ! if (istat .ne. 0) print *, " cuda malloc failed h_dev", istat ! ! istat = cuda_malloc(hs_dev , nb*size_of_complex_datatype) ! if (istat .ne. 0) print *, " cuda malloc failed hs_dev", istat ! ! istat = cuda_malloc(x_dev , 1*size_of_complex_datatype) ! if (istat .ne. 0) print *, " cuda malloc failed x_dev", istat ! ! istat = cuda_malloc( tau_new_dev , 1*size_of_complex_datatype) ! if (istat .ne. 0) print *, " cuda malloc failed tau_new_dev", istat ! ! istat = cuda_malloc(hv_dev , nb*size_of_complex_datatype) ! if (istat .ne. 0) print *, " cuda malloc failed hv_dev", istat ! ! istat = cuda_malloc(hd_dev , nb*size_of_complex_datatype) ! if (istat .ne. 0) print *, " cuda malloc failed hd_dev", istat !#endif ! n_off: Offset of ab within band n_off = block_limits(my_pe)*nb ! Redistribute band in a to ab #ifdef DOUBLE_PRECISION_COMPLEX call redist_band_complex_double(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab) #else call redist_band_complex_single(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab) #endif ! Calculate the workload for each sweep in the back transformation ! and the space requirements to hold the HH vectors allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_complex: error when allocating limits "//errorMessage stop endif call determine_workload(na, nb, np_rows, limits) max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) num_hh_vecs = 0 num_chunks = 0 nx = na do n = 1, nblocks_total call determine_workload(nx, nb, np_rows, limits) local_size = limits(my_prow+1) - limits(my_prow) ! add to number of householder vectors ! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor send below! if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then num_hh_vecs = num_hh_vecs + local_size num_chunks = num_chunks+1 endif nx = nx - nb enddo ! Allocate space for HH vectors allocate(hh_trans_complex(nb,num_hh_vecs), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_complex: error when allocating hh_trans_comples "//errorMessage stop endif ! Allocate and init MPI requests allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests if (istat .ne. 
      0) then
        print *,"tridiag_band_complex: error when allocating ireq_hhr "//errorMessage
        stop
      endif
      allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage) ! Send requests
      if (istat .ne. 0) then
        print *,"tridiag_band_complex: error when allocating ireq_hhs "//errorMessage
        stop
      endif

      num_hh_vecs = 0
      num_chunks  = 0
      nx = na
      nt = 0
      do n = 1, nblocks_total
        call determine_workload(nx, nb, np_rows, limits)
        local_size = limits(my_prow+1) - limits(my_prow)
        if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
          num_chunks = num_chunks+1
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
          call mpi_irecv(hh_trans_complex(1,num_hh_vecs+1), nb*local_size, MPI_COMPLEX16, nt, &
                         10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr)
#else
          call mpi_irecv(hh_trans_complex(1,num_hh_vecs+1), nb*local_size, MPI_COMPLEX8, nt, &
                         10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr)
#endif
#else /* WITH_MPI */
          ! careful: for the non-blocking recv, the data copy must be done at wait or send
          ! hh_trans_complex(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)
#endif /* WITH_MPI */
          num_hh_vecs = num_hh_vecs + local_size
        endif
        nx = nx - nb
        if (n == block_limits(nt+1)) then
          nt = nt + 1
        endif
      enddo
#ifdef WITH_MPI
      ireq_hhs(:) = MPI_REQUEST_NULL
#endif
      ! Buffers for gathering/sending the HH vectors

      allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
      if (istat .ne. 0) then
        print *,"tridiag_band_complex: error when allocating hh_gath "//errorMessage
        stop
      endif

      allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
      if (istat .ne. 0) then
        print *,"tridiag_band_complex: error when allocating hh_send "//errorMessage
        stop
      endif

#ifdef DOUBLE_PRECISION_COMPLEX
      hh_gath(:,:,:) = 0._ck8
      hh_send(:,:,:) = 0._ck8
#else
      hh_gath(:,:,:) = 0._ck4
      hh_send(:,:,:) = 0._ck4
#endif

      ! Some counters

      allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_complex: error when allocating hh_cnt "//errorMessage
        stop
      endif

      allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_complex: error when allocating hh_dst "//errorMessage
        stop
      endif

      hh_cnt(:) = 1 ! The first transformation vector is always 0 and not calculated at all
      hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI
      ireq_ab = MPI_REQUEST_NULL
      ireq_hv = MPI_REQUEST_NULL
#endif
      ! Limits for sending

      allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_complex: error when allocating snd_limits "//errorMessage
        stop
      endif

      do iblk=1,nblocks
        call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
      enddo

#ifdef WITH_OPENMP
      ! OpenMP work distribution:

      max_threads = 1
!$    max_threads = omp_get_max_threads()

      ! For OpenMP we need at least 2 blocks for every thread
      max_threads = MIN(max_threads, nblocks/2)
      if (max_threads==0) max_threads = 1

      allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_complex: error when allocating omp_block_limits "//errorMessage
        stop
      endif

      ! Get the OpenMP block limits
      call divide_band(nblocks, max_threads, omp_block_limits)

      allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"tridiag_band_complex: error when allocating hv_t, tau_t "//errorMessage
        stop
      endif
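      ! Band layout and message traffic used below (in essence):
      !   ab(1,j) holds the diagonal element of column j and ab(i,j) the element
      !   i-1 rows below it, i.e. ab(i,j) = a_full(j+i-1,j) for i = 1 .. 2*nb.
      !   During the sweeps, the first column of a band chunk travels to the
      !   previous PE (tag 1, buffer ab_s), Householder vectors travel to the next
      !   PE (tag 2, buffer hv_s), and completed vectors are shipped via hh_send
      !   to the PEs needing them for the back transformation (tags 10+iblk).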
#ifdef DOUBLE_PRECISION_COMPLEX
      hv_t = 0._ck8
      tau_t = 0._ck8
#else
      hv_t = 0._ck4
      tau_t = 0._ck4
#endif
#endif /* WITH_OPENMP */

      ! ---------------------------------------------------------------------------
      ! Start of calculations

      na_s = block_limits(my_pe)*nb + 1

      if (my_pe>0 .and. na_s<=na) then
        ! send first column to previous PE
        ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
        ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
        call mpi_isend(ab_s, nb+1, MPI_COMPLEX16, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#else
        call mpi_isend(ab_s, nb+1, MPI_COMPLEX8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#endif
#endif /* WITH_MPI */
      endif

#ifndef WITH_MPI
      startAddr = ubound(hh_trans_complex,dim=2)
#endif

#ifdef WITH_OPENMP
      do istep=1,na-1-block_limits(my_pe)*nb
#else
      do istep=1,na-1
#endif
        if (my_pe==0) then
          n = MIN(na-na_s,nb) ! number of rows to be reduced
#ifdef DOUBLE_PRECISION_COMPLEX
          hv(:) = 0._ck8
          tau = 0._ck8
#else
          hv(:) = 0._ck4
          tau = 0._ck4
#endif
          ! Transform first column of remaining matrix
          ! Opposed to the real case, the last step (istep=na-1) is needed here for making
          ! the last subdiagonal element a real number

#ifdef DOUBLE_PRECISION_COMPLEX
          vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk8)**2+dimag(ab(3:n+1,na_s-n_off))**2)
#else
          vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk4)**2+aimag(ab(3:n+1,na_s-n_off))**2)
#endif
          if (n<2) vnorm2 = 0. ! Safety only

#ifdef DOUBLE_PRECISION_COMPLEX
          call hh_transform_complex_double(ab(2,na_s-n_off),vnorm2,hf,tau)
#else
          call hh_transform_complex_single(ab(2,na_s-n_off),vnorm2,hf,tau)
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
          hv(1) = 1._ck8
#else
          hv(1) = 1._ck4
#endif
          hv(2:n) = ab(3:n+1,na_s-n_off)*hf

          d(istep) = ab(1,na_s-n_off)
          e(istep) = ab(2,na_s-n_off)
          if (istep == na-1) then
            d(na) = ab(1,na_s+1-n_off)
#ifdef DOUBLE_PRECISION_COMPLEX
            e(na) = 0._rk8
#else
            e(na) = 0._rk4
#endif
          endif
        else
          if (na>na_s) then
            ! Receive Householder vector from previous task, from PE owning subdiagonal
#ifdef WITH_OPENMP
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
            call mpi_recv(hv, nb, MPI_COMPLEX16, my_pe-1, 2, mpi_comm, mpi_status, mpierr)
#else
            call mpi_recv(hv, nb, MPI_COMPLEX8, my_pe-1, 2, mpi_comm, mpi_status, mpierr)
#endif
#else /* WITH_MPI */
            hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
            call mpi_recv(hv, nb, MPI_COMPLEX16, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#else
            call mpi_recv(hv, nb, MPI_COMPLEX8, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#endif
#else /* WITH_MPI */
            hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
            tau = hv(1)
#ifdef DOUBLE_PRECISION_COMPLEX
            hv(1) = 1._ck8
#else
            hv(1) = 1._ck4
#endif
          endif
        endif

        na_s = na_s+1
        if (na_s-n_off > nb) then
!#ifdef WITH_GPU_VERSION
!          ab_temp(:,1:nblocks*nb) = ab(:,nb+1:(nblocks +1)*nb)
!          ab(:, 1:nblocks*nb) = ab_temp(:, 1:nblocks*nb)
!#else
          ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
!#endif
          ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0
          n_off = n_off + nb
        endif
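        ! For reference, the contract of hh_transform_complex_* as used above
        ! (a sketch inferred from its call sites in this file, not a formal spec):
        ! given the leading element (inout) and the squared norm of the remaining
        ! elements, it returns tau and a scaling factor such that the reflector
        ! H = I - tau*hv*hv**H annihilates the entries below the leading element, e.g.
        !   vnorm2 = sum(real(x(2:n),kind=rk8)**2 + dimag(x(2:n))**2)
        !   call hh_transform_complex_double(x(1), vnorm2, xf, tau) ! x(1) <- transformed value
        !   hv(1)   = 1._ck8                                        ! hv(1) is 1 by convention
        !   hv(2:n) = x(2:n)*xf                                     ! scaled trailing part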
#ifdef WITH_OPENMP
        if (max_threads > 1) then

          ! Codepath for OpenMP

          ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
          ! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
          ! This simulates the behaviour of the MPI tasks which also work after each other.
          ! The code would be considerably easier, if the MPI communication would be made within
          ! the parallel region - this is avoided here since this would require
          ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.

          hv_t(:,1) = hv
          tau_t(1) = tau

          do iter = 1, 2

            ! iter=1 : work on first block
            ! iter=2 : work on remaining blocks
            ! This is done in 2 iterations so that we have a barrier in between:
            ! After the first iteration, it is guaranteed that the last row of the last block
            ! is completed by the next thread.
            ! After the first iteration it is also the place to exchange the last row
            ! with MPI calls
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
            call timer%start("OpenMP parallel_double")
#else
            call timer%start("OpenMP parallel_single")
#endif
#endif

!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp&                    nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
            do my_thread = 1, max_threads

              if (iter == 1) then
                my_block_s = omp_block_limits(my_thread-1) + 1
                my_block_e = my_block_s
              else
                my_block_s = omp_block_limits(my_thread-1) + 2
                my_block_e = omp_block_limits(my_thread)
              endif

              do iblk = my_block_s, my_block_e

                ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
                ne = ns+nb-1                                    ! last column in block

                if (istep<my_thread .or. ns+n_off>na) exit

                hv = hv_t(:,my_thread)
                tau = tau_t(my_thread)

                ! Store Householder vector for back transformation

                hh_cnt(iblk) = hh_cnt(iblk) + 1

                hh_gath(1   ,hh_cnt(iblk),iblk) = tau
                hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

                nc = MIN(na-ns-n_off+1,nb)    ! number of columns in diagonal block
                nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                              ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

                ! Transform diagonal block
#ifdef DOUBLE_PRECISION_COMPLEX
                call ZHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, (0.0_rk8, 0.0_rk8), hd, 1)
#else
                call CHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, (0.0_rk4, 0.0_rk4), hd, 1)
#endif
                x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
                hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc)
#ifdef DOUBLE_PRECISION_COMPLEX
                call ZHER2('L', nc, (-1.0_rk8, 0.0_rk8), hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#else
                call CHER2('L', nc, (-1.0_rk4, 0.0_rk4), hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                hv_t(:,my_thread) = 0._ck8
                tau_t(my_thread) = 0._ck8
#else
                hv_t(:,my_thread) = 0._ck4
                tau_t(my_thread) = 0._ck4
#endif
                if (nr<=0) cycle ! No subdiagonal block present any more

                ! Transform subdiagonal block
#ifdef DOUBLE_PRECISION_COMPLEX
                call ZGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, (0.0_rk8,0.0_rk8), hs, 1)
#else
                call CGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, (0.0_rk4,0.0_rk4), hs, 1)
#endif

                if (nr>1) then

                  ! complete (old) Householder transformation for first column

                  ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1

                  ! calculate new Householder transformation for first column
                  ! (stored in hv_t(:,my_thread) and tau_t(my_thread))

#ifdef DOUBLE_PRECISION_COMPLEX
                  vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
#else
                  vnorm2 = sum(real(ab(nb+2:nb+nr,ns))**2+aimag(ab(nb+2:nb+nr,ns))**2)
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                  call hh_transform_complex_double(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread))
#else
                  call hh_transform_complex_single(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread))
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                  hv_t(1   ,my_thread) = 1._ck8
                  hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
                  ab(nb+2:,ns) = 0._ck8
#else
                  hv_t(1   ,my_thread) = 1._ck4
                  hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
                  ab(nb+2:,ns) = 0._ck4
#endif
                  !
update subdiagonal block for old and new Householder transformation ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster #ifdef DOUBLE_PRECISION_COMPLEX call ZGEMV('C', nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, (0.0_rk8,0.0_rk8), h(2), 1) #else call CGEMV('C', nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, (0.0_rk4,0.0_rk4), h(2), 1) #endif x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread) h(2:nb) = h(2:nb) - x*hv(2:nb) ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2") do i=2,nb ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) & - hv_t(1:nr,my_thread)*conjg(h(i)) - hs(1:nr)*conjg(hv(i)) enddo else ! No new Householder transformation for nr=1, just complete the old one ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1 do i=2,nb ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i)) enddo ! For safety: there is one remaining dummy transformation (but tau is 0 anyways) #ifdef DOUBLE_PRECISION_COMPLEX hv_t(1,my_thread) = 1._ck8 #else hv_t(1,my_thread) = 1._ck4 #endif endif enddo enddo ! my_thread !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("OpenMP parallel_double") #else call timer%stop("OpenMP parallel_single") #endif #endif if (iter==1) then ! We are at the end of the first block ! Send our first column to previous PE if (my_pe>0 .and. na_s <= na) then #ifdef WITH_MPI call mpi_wait(ireq_ab,mpi_status,mpierr) #endif ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call mpi_isend(ab_s, nb+1, MPI_COMPLEX16, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) #else call mpi_isend(ab_s, nb+1, MPI_COMPLEX8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) #endif #endif /* WITH_MPI */ endif ! Request last column from next PE ne = na_s + nblocks*nb - (max_threads-1) - 1 if (istep>=max_threads .and. ne <= na) then #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call mpi_recv(ab(1,ne-n_off), nb+1, MPI_COMPLEX16, my_pe+1, 1, mpi_comm, mpi_status, mpierr) #else call mpi_recv(ab(1,ne-n_off), nb+1, MPI_COMPLEX8, my_pe+1, 1, mpi_comm, mpi_status, mpierr) #endif #else /* WITH_MPI */ ab(1:nb+1,ne-n_off) = ab_s(1:nb+1) #endif /* WITH_MPI */ endif else ! We are at the end of all blocks ! Send last HH vector and TAU to next PE if it has been calculated above ne = na_s + nblocks*nb - (max_threads-1) - 1 if (istep>=max_threads .and. ne < na) then #ifdef WITH_MPI call mpi_wait(ireq_hv,mpi_status,mpierr) #endif hv_s(1) = tau_t(max_threads) hv_s(2:) = hv_t(2:,max_threads) #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call mpi_isend(hv_s, nb, MPI_COMPLEX16, my_pe+1, 2, mpi_comm, ireq_hv, mpierr) #else call mpi_isend(hv_s, nb, MPI_COMPLEX8, my_pe+1, 2, mpi_comm, ireq_hv, mpierr) #endif #endif /* WITH_MPI */ endif ! "Send" HH vector and TAU to next OpenMP thread do my_thread = max_threads, 2, -1 hv_t(:,my_thread) = hv_t(:,my_thread-1) tau_t(my_thread) = tau_t(my_thread-1) enddo endif enddo ! iter else ! Codepath for 1 thread without OpenMP ! The following code is structured in a way to keep waiting times for ! other PEs at a minimum, especially if there is only one block. ! For this reason, it requests the last column as late as possible ! and sends the Householder vector and the first column as early ! as possible. #endif /* WITH_OPENMP */ !#ifdef WITH_GPU_VERSION ! call cpu_time(start) !#endif do iblk=1,nblocks ns = na_s + (iblk-1)*nb - n_off ! first column in block ne = ns+nb-1 ! 
last column in block if (ns+n_off>na) exit ! Store Householder vector for back transformation hh_cnt(iblk) = hh_cnt(iblk) + 1 hh_gath(1 ,hh_cnt(iblk),iblk) = tau hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) #ifndef WITH_OPENMP if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then ! Wait for last transfer to finish #ifdef WITH_MPI call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) #endif ! Copy vectors into send buffer hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) ! Send to destination #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX16, & global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), & 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) #else call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX8, & global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), & 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) #endif #else /* WITH_MPI */ startAddr = startAddr - hh_cnt(iblk) hh_trans_complex(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) #endif /* WITH_MPI */ ! Reset counter and increase destination row hh_cnt(iblk) = 0 hh_dst(iblk) = hh_dst(iblk)+1 endif ! The following code is structured in a way to keep waiting times for ! other PEs at a minimum, especially if there is only one block. ! For this reason, it requests the last column as late as possible ! and sends the Householder vector and the first column as early ! as possible. #endif /* OpenMP */ nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) ! Note that nr>=0 implies that diagonal block is full (nc==nb)! ! Multiply diagonal block and subdiagonal block with Householder vector if (iblk==nblocks .and. nc==nb) then ! We need the last column from the next PE. ! First do the matrix multiplications without last column ... ! Diagonal block, the contribution of the last element is added below #ifdef DOUBLE_PRECISION_COMPLEX ab(1,ne) = 0._ck8 call ZHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1,(0.0_rk8,0.0_rk8),hd,1) ! Subdiagonal block if (nr>0) call ZGEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1,(0.0_rk8,0.0_rk8),hs,1) ! ... then request last column ... ! ... then request last column ... #ifdef WITH_MPI #ifdef WITH_OPENMP call mpi_recv(ab(1,ne), nb+1, MPI_COMPLEX16, my_pe+1, 1, mpi_comm, mpi_status, mpierr) #else call mpi_recv(ab(1,ne), nb+1, MPI_COMPLEX16, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr) #endif #else /* WITH_MPI */ ab(1:nb+1,ne) = ab_s(1:nb+1) #endif /* WITH_MPI */ ! ... and complete the result hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau else ! Normal matrix multiply call ZHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, (0.0_rk8,0.0_rk8), hd, 1) if (nr>0) call ZGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, (0.0_rk8,0.0_rk8), hs, 1) endif #else /* DOUBLE_PRECISION_COMPLEX */ ab(1,ne) = 0._ck4 call CHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1,(0.0_rk4,0.0_rk4),hd,1) ! Subdiagonal block if (nr>0) call CGEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1,(0.0_rk4,0.0_rk4),hs,1) ! ... then request last column ... #ifdef WITH_MPI #ifdef WITH_OPENMP call mpi_recv(ab(1,ne), nb+1, MPI_COMPLEX8, my_pe+1, 1, mpi_comm, mpi_status, mpierr) #else call mpi_recv(ab(1,ne), nb+1, MPI_COMPLEX8, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr) #endif #else /* WITH_MPI */ ab(1:nb+1,ne) = ab_s(1:nb+1) #endif /* WITH_MPI */ ! ... 
! ... and complete the result hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau else ! Normal matrix multiply call CHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, (0.0_rk4,0.0_rk4), hd, 1) if (nr>0) call CGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, (0.0_rk4,0.0_rk4), hs, 1) endif #endif /* DOUBLE_PRECISION_COMPLEX */ ! Calculate first column of subdiagonal block and calculate new ! Householder transformation for this column hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb !#ifdef WITH_GPU_VERSION ! istat = cuda_memset(hv_new_dev, 0,nb*size_of_complex_datatype ) ! if (istat .ne. 0) print *, " cuda memset failed hv_new_dev", istat !#endif tau_new = 0 if (nr>0) then ! complete (old) Householder transformation for first column ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 ! calculate new Householder transformation ... if (nr>1) then #ifdef DOUBLE_PRECISION_COMPLEX vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk8)**2+dimag(ab(nb+2:nb+nr,ns))**2) #else vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk4)**2+aimag(ab(nb+2:nb+nr,ns))**2) #endif #ifdef DOUBLE_PRECISION_COMPLEX call hh_transform_complex_double(ab(nb+1,ns),vnorm2,hf,tau_new) #else call hh_transform_complex_single(ab(nb+1,ns),vnorm2,hf,tau_new) #endif #ifdef DOUBLE_PRECISION_COMPLEX hv_new(1) = 1._ck8 hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf ab(nb+2:,ns) = 0._ck8 #else hv_new(1) = 1._ck4 hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf ab(nb+2:,ns) = 0._ck4 #endif endif ! ... and send it away immediately if this is the last block if (iblk==nblocks) then #ifdef WITH_MPI #ifdef WITH_OPENMP call mpi_wait(ireq_hv,mpi_status,mpierr) #else call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) #endif #endif /* WITH_MPI */ hv_s(1) = tau_new hv_s(2:) = hv_new(2:) #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call mpi_isend(hv_s, nb, MPI_COMPLEX16, my_pe+1, 2 ,mpi_comm, ireq_hv, mpierr) #else call mpi_isend(hv_s, nb, MPI_COMPLEX8, my_pe+1, 2 ,mpi_comm, ireq_hv, mpierr) #endif #endif /* WITH_MPI */ endif endif ! Transform diagonal block x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau) #ifdef DOUBLE_PRECISION_COMPLEX hd(1:nc) = hd(1:nc) - 0.5_rk8*x*hv(1:nc) #else hd(1:nc) = hd(1:nc) - 0.5_rk4*x*hv(1:nc) #endif !#ifdef WITH_GPU_VERSION ! istat = cuda_memcpy2d((ab_dev + (ns-1)*2*nb*size_of_complex_datatype), 2*nb*size_of_complex_datatype,loc(a(1,ns)), 2*nb*size_of_complex_datatype, 2*size_of_complex_datatype , & ! 2*nb*size_of_complex_datatype,cudaMemcpyHostToDevice) ! if (istat .ne. 0) print *, "cuda memcpy a_dev H2D failed ", istat ! istat =cuda_memcpy(hv_dev,loc(hv),nc*size_of_complex_datatype,cudaMemcpyHostToDevice) ! if (istat .ne. 0) print *,"cuda memcpy failed hv_dev", istat ! istat =cuda_memcpy(hd_dev,loc(hd), nb*size_of_complex_datatype,cudaMemcpyHostToDevice) ! if (istat .ne. 0) print *,"cuda memcpy failed hd_dev", istat !#endif if (my_pe>0 .and. iblk==1) then ! The first column of the diagonal block has to be sent to the previous PE ! Calculate first column only ... !#ifdef WITH_GPU_VERSION ! call double_hh_transform_2( ns, nc, nb ) ! istat=cuda_memcpy(loc(ab),ab_dev,(2*nb*(nblocks+1)*nb)*size_of_complex_datatype,cudaMemcpyDeviceToHost) ! if (istat .ne. 0) print *, " cuda memcpy failed ab ", istat !#else ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*conjg(hv(1)) - hv(1:nc)*conjg(hd(1)) !#endif ! ... send it away ...
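!NOTE (added): only the first column is updated and sent here, before the
! rank-2 update of the remaining columns, so the previous PE can continue
! working while this PE completes the block (early-send strategy, see above).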
#ifdef WITH_MPI #ifdef WITH_OPENMP call mpi_wait(ireq_ab,mpi_status,mpierr) #else call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) #endif #endif /* WITH_MPI */ ab_s(1:nb+1) = ab(1:nb+1,ns) #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call mpi_isend(ab_s, nb+1, MPI_COMPLEX16, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) #else call mpi_isend(ab_s, nb+1, MPI_COMPLEX8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) #endif #endif /* WITH_MPI */ ! ... and calculate remaining columns with rank-2 update if (nc>1) then !#ifdef WITH_GPU_VERSION ! call cublas_ZHER2( 'L',nc -1,(-1.d0,0.d0), hd_dev + 1*16, 1, hv_dev +1*16, 1 , ab_dev + (ns*2*nb )*16, 2*nb-1) !#else #ifdef DOUBLE_PRECISION_COMPLEX call ZHER2('L', nc-1, (-1.0_rk8,0.0_rk8), hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1) #else call CHER2('L', nc-1, (-1.0_rk4,0.0_rk4), hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1) #endif !#endif endif else ! No need to send, just a rank-2 update !#ifdef WITH_GPU_VERSION ! call cublas_ZHER2( 'L',nc ,(-1.d0,0.d0), hd_dev, 1, hv_dev, 1 , ab_dev + ((ns-1)*2*nb )*16, 2*nb-1) !#else #ifdef DOUBLE_PRECISION_COMPLEX call ZHER2('L', nc, (-1.0_rk8,0.0_rk8), hd, 1, hv, 1, ab(1,ns), 2*nb-1) #else call CHER2('L', nc, (-1.0_rk4,0.0_rk4), hd, 1, hv, 1, ab(1,ns), 2*nb-1) #endif !#endif endif !#ifdef WITH_GPU_VERSION ! istat=cuda_memcpy( loc(hd),hd_dev,nb*size_of_complex_datatype,cudaMemcpyDeviceToHost) ! if (istat .ne. 0) print *,"cuda memcpy failed hd_dev", istat !#endif ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb !#ifdef WITH_GPU_VERSION ! istat =cuda_memcpy(hs_dev,loc(hs),nb*size_of_complex_datatype,cudaMemcpyHostToDevice) ! if (istat .ne. 0) print *,"cuda memcpy failed hs_dev", istat !#endif if (nr>0) then if (nr>1) then !#ifdef WITH_GPU_VERSION ! istat = cuda_memcpy(hv_new_dev,loc(hv_new),nb*size_of_complex_datatype,cudaMemcpyHostToDevice) ! if (istat .ne. 0) print *,"cuda memcpy failed hv_new_dev", istat ! ! istat = cuda_memcpy(h_dev,loc(h),nb*size_of_complex_datatype,cudaMemcpyHostToDevice) ! if (istat .ne. 0) print *,"cuda memcpy failed h_dev", istat ! ! call cublas_ZGEMV('C',nr,nb-1,tau_new,ab_dev + (nb-1 + ns *2*nb)*16,2*nb-1,hv_new_dev,1,(0.d0,0.d0),h_dev + 1* 16,1) ! ! istat = cuda_memcpy(tau_new_dev,loc(tau_new),1*size_of_complex_datatype,cudaMemcpyHostToDevice) ! if (istat .ne. 0) print *,"cuda memcpy failed tau_new_dev", istat ! ! call dot_product_kernel(nr , tau_new) ! call dot_product_kernel_1( nb, nr , ns) ! ! istat = cuda_memcpy(loc(x),x_dev,1*size_of_complex_datatype,cudaMemcpyDeviceToHost) ! if (istat .ne. 0) print *, " cuda memcpy failed x_dev ", istat ! ! istat =cuda_memcpy(loc(h),h_dev,nb*size_of_complex_datatype,cudaMemcpyDeviceToHost) ! if (istat .ne. 0) print *, " cuda memcpy failed h ", istat !#else #ifdef DOUBLE_PRECISION_COMPLEX call ZGEMV('C', nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, hv_new, 1, (0.0_rk8, 0.0_rk8), h(2), 1) #else call CGEMV('C', nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, hv_new, 1, (0.0_rk4, 0.0_rk4), h(2), 1) #endif x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new h(2:nb) = h(2:nb) - x*hv(2:nb) ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update do i=2,nb ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i)) enddo !#endif else ! No double Householder transformation for nr=1, just complete the row !#ifdef WITH_GPU_VERSION ! call double_hh_transform_1(nb, ns) !#else do i=2,nb ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i)) enddo !#endif endif endif ! 
Use new HH vector for the next block hv(:) = hv_new(:) tau = tau_new enddo !#ifdef WITH_GPU_VERSION ! call cpu_time(finish) ! tstep2 = finish-start ! t_2 = t_2 + tstep2 !#endif #ifdef WITH_OPENMP endif #endif #ifdef WITH_OPENMP do iblk = 1, nblocks if (hh_dst(iblk) >= np_rows) exit if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then ! Wait for last transfer to finish #ifdef WITH_MPI call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr) #endif ! Copy vectors into send buffer hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) ! Send to destination #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_complex16, & global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), & 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) #else call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_complex8, & global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), & 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) #endif #else /* WITH_MPI */ startAddr = startAddr - hh_cnt(iblk) hh_trans_complex(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) #endif /* WITH_MPI */ ! Reset counter and increase destination row hh_cnt(iblk) = 0 hh_dst(iblk) = hh_dst(iblk)+1 endif enddo #endif /* WITH_OPENMP */ enddo !#ifdef WITH_GPU_VERSION ! call cpu_time(finish_1) ! tstep1 = finish_1-start_1 ! t_1 = t_1 + tstep1 !#endif ! Finish the last outstanding requests #ifdef WITH_OPENMP #ifdef WITH_MPI call mpi_wait(ireq_ab,mpi_status,mpierr) call mpi_wait(ireq_hv,mpi_status,mpierr) allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks)), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_complex: error when allocating mpi_statuses "//errorMessage stop endif call mpi_waitall(nblocks, ireq_hhs, mpi_statuses, mpierr) call mpi_waitall(num_chunks, ireq_hhr, mpi_statuses, mpierr) deallocate(mpi_statuses, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_complex: error when deallocating mpi_statuses "//errorMessage stop endif #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef WITH_MPI call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) #endif #endif /* WITH_OPENMP */ #ifdef WITH_MPI call mpi_barrier(mpi_comm,mpierr) #endif deallocate(ab, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_complex: error when deallocating ab "//errorMessage stop endif !#ifdef WITH_GPU_VERSION ! deallocate(ab_temp, stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"error when deallocating ab_temp "//errorMessage ! stop ! endif ! !#endif deallocate(ireq_hhr, ireq_hhs, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_complex: error when deallocating ireq_hhr, ireq_hhs "//errorMessage stop endif deallocate(hh_cnt, hh_dst, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_complex: error when deallocating hh_cnt, hh_dst "//errorMessage stop endif deallocate(hh_gath, hh_send, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_complex: error when deallocating hh_gath, hh_send, "//errorMessage stop endif deallocate(limits, snd_limits, stat=istat, errmsg=errorMessage) if (istat .ne. 
0) then print *,"tridiag_band_complex: error when deallocating limits, snd_limits "//errorMessage stop endif deallocate(block_limits, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_complex: error when deallocating block_limits, "//errorMessage stop endif deallocate(global_id, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_complex: error when deallocating global_id, "//errorMessage stop endif !#ifdef WITH_GPU_VERSION ! istat = cuda_free(ab_dev) ! if (istat .ne. 0) then ! print *,"error in cudaFree" ! stop ! endif ! ! istat = cuda_free(hv_new_dev) ! if (istat .ne. 0) then ! print *,"error in cudaFree" ! stop ! endif ! ! istat = cuda_free(hs_dev) ! if (istat .ne. 0) then ! print *,"error in cudaFree" ! stop ! endif ! ! istat = cuda_free(h_dev) ! if (istat .ne. 0) then ! print *,"error in cudaFree" ! stop ! endif ! ! istat = cuda_free(tau_new_dev) ! if (istat .ne. 0) then ! print *,"error in cudaFree" ! stop ! endif ! ! istat = cuda_free(x_dev) ! if (istat .ne. 0) then ! print *,"error in cudaFree" ! stop ! endif ! !#endif #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("tridiag_band_complex_double") #else call timer%stop("tridiag_band_complex_single") #endif #endif !#ifdef WITH_GPU_VERSION ! contains ! ! subroutine dot_product_kernel(nr,tau_new) ! implicit none ! integer, intent(in) :: nr ! complex*16, intent(in) :: tau_new ! ! call launch_dot_product_kernel( hs_dev,hv_new_dev,tau_new,x_dev,h_dev,hv_dev, nr ) ! end subroutine ! ! subroutine dot_product_kernel_1( nb , nr , ns) ! implicit none ! integer, intent(in) :: nb, nr, ns ! ! call launch_dot_product_kernel_1(ab_dev,hs_dev, hv_new_dev,x_dev,h_dev,hv_dev,nb , nr, ns) ! end subroutine ! ! subroutine double_hh_transform_1( nb , ns) ! implicit none ! integer, intent(in) :: nb, ns ! ! call launch_double_hh_transform_1(ab_dev,hs_dev,hv_dev,nb , ns) ! end subroutine ! ! subroutine double_hh_transform_2( ns,nc, nb) ! implicit none ! integer, intent(in) :: nc, ns, nb ! ! call launch_double_hh_transform_2(ab_dev,hd_dev,hv_dev,nc , ns, nb) ! end subroutine !#endif #ifdef DOUBLE_PRECISION_COMPLEX end subroutine tridiag_band_complex_double ! has to be checked for GPU #else end subroutine tridiag_band_complex_single ! has to be checked for GPU #endif #define ATODEV istat = cuda_memcpy(a_dev, loc(a), stripe_width*a_dim2*stripe_count*size_of_complex_datatype, cudaMemcpyHostToDevice) #define ATOHOST istat = cuda_memcpy(loc(a), a_dev, stripe_width*a_dim2*stripe_count*size_of_complex_datatype, cudaMemcpyDeviceToHost) #ifdef DOUBLE_PRECISION_COMPLEX subroutine trans_ev_tridi_to_band_complex_double(na, nev, nblk, nbw, q, ldq, matrixCols, & hh_trans_complex, mpi_comm_rows, mpi_comm_cols, & wantDebug, useGPU, success, THIS_COMPLEX_ELPA_KERNEL) #else subroutine trans_ev_tridi_to_band_complex_single(na, nev, nblk, nbw, q, ldq, matrixCols, & hh_trans_complex, mpi_comm_rows, mpi_comm_cols, & wantDebug, useGPU, success, THIS_COMPLEX_ELPA_KERNEL) #endif !------------------------------------------------------------------------------- ! trans_ev_tridi_to_band_complex: ! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix ! ! Parameters ! ! na Order of matrix a, number of rows of matrix q ! ! nev Number of eigenvectors to compute (= columns of matrix q) ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! nbw semi bandwidth ! ! q On input: Eigenvectors of tridiagonal matrix !
On output: Transformed eigenvectors ! Distribution is like in Scalapack. ! ! ldq Leading dimension of q ! matrixCols local columns of matrix q ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns/both ! !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use pack_unpack_complex use compute_hh_trafo_complex use precision use cuda_functions use iso_c_binding implicit none logical, intent(in) :: useGPU integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols #ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=COMPLEX_DATATYPE) :: q(ldq,*) #else complex(kind=COMPLEX_DATATYPE) :: q(ldq,matrixCols) #endif complex(kind=COMPLEX_DATATYPE) :: hh_trans_complex(:,:) integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol integer(kind=ik) :: tmp integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2 integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count #ifdef WITH_OPENMP integer(kind=ik) :: thread_width, csw, b_off, b_len #endif integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd integer(kind=ik) :: a_off, current_tv_off, max_blk_size integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk logical :: flag #ifdef WITH_OPENMP complex(kind=COMPLEX_DATATYPE), pointer :: a(:,:,:,:) #else complex(kind=COMPLEX_DATATYPE), pointer :: a(:,:,:) #endif complex(kind=COMPLEX_DATATYPE) :: a_complex type(c_ptr) :: a_ptr complex(kind=COMPLEX_DATATYPE), allocatable :: row(:) complex(kind=COMPLEX_DATATYPE), allocatable :: row_group(:,:) #ifdef WITH_OPENMP complex(kind=COMPLEX_DATATYPE), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) complex(kind=COMPLEX_DATATYPE), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) #else complex(kind=COMPLEX_DATATYPE), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:) complex(kind=COMPLEX_DATATYPE), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:) #endif integer(kind=c_intptr_t) :: a_dev integer(kind=c_intptr_t) :: bcast_buffer_dev integer(kind=c_size_t) :: num integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2 integer(kind=c_intptr_t) :: row_dev integer(kind=c_intptr_t) :: row_group_dev integer(kind=c_intptr_t) :: hh_tau_dev integer(kind=c_intptr_t) :: hh_dot_dev integer(kind=ik) :: row_group_size, unpack_idx integer(kind=ik) :: n_times integer(kind=ik) :: top, chunk, this_chunk complex(kind=COMPLEX_DATATYPE), allocatable :: result_buffer(:,:,:) complex(kind=COMPLEX_DATATYPE), allocatable :: bcast_buffer(:,:) integer(kind=ik) :: n_off integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:) integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:) integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:) #ifdef WITH_OPENMP integer(kind=ik), allocatable :: mpi_statuses(:,:) #ifdef WITH_MPI integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) #endif #endif #ifdef WITH_MPI integer(kind=ik), external :: numroc #endif integer(kind=ik) :: na_rows, na_cols ! real*8 :: ttt0, ttt1, ttt2, t2_compute_kernel, t0_compute_kernel,t1_compute_kernel, & ! 
t0_mpi_time, t1_mpi_time,t2_mpi_time ! real*8 :: t0_cpu_code,t1_cpu_code,t2_cpu_code,t0_block_time,t1_block_time,t2_block_time,t0_cuda_memcpy ! real*8 :: t0_inner_do_time, t1_inner_do_time , t2_inner_do_time,t0_outer_do_time ,t1_outer_do_time , & ! t2_outer_do_time ,t0_result_time ,t1_result_time, t2_result_time,t0_mpi_recv_time, & ! t1_mpi_recv_time,t2_mpi_recv_time ! real*8 :: t1_mpi_wait_time,t0_mpi_wait_time,t2_mpi_wait_time,t1_memcpy_time,t0_memcpy_time,t2_memcpy_time, & ! t1_mpi_irecv_time,t0_mpi_irecv_time,t2_mpi_irecv_time,t0_mpi_outer_wait_time,t1_mpi_outer_wait_time,& ! t2_mpi_outer_wait_time, time0 ! real*4 :: time1 ! MPI send/recv tags, arbitrary integer(kind=ik), parameter :: bottom_recv_tag = 111 integer(kind=ik), parameter :: top_recv_tag = 222 integer(kind=ik), parameter :: result_recv_tag = 333 #ifdef WITH_OPENMP integer(kind=ik) :: max_threads, my_thread integer(kind=ik) :: omp_get_max_threads #endif ! Just for measuring the kernel performance real(kind=c_double) :: kernel_time ! MPI_WTIME always needs double ! long integer integer(kind=lik) :: kernel_flops logical, intent(in) :: wantDebug integer(kind=ik) :: istat character(200) :: errorMessage logical :: successCUDA logical :: success #ifndef WITH_MPI integer(kind=ik) :: j1 #endif #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%start("trans_ev_tridi_to_band_complex_double") #else call timer%start("trans_ev_tridi_to_band_complex_single") #endif #endif if (useGPU) then n_times =0 ! n_times_1 =0 unpack_idx = 0 row_group_size = 0 ! time0=0 ! t0_compute_kernel=0 endif kernel_time = 0.0 kernel_flops = 0 #ifdef WITH_OPENMP max_threads = 1 max_threads = omp_get_max_threads() #endif call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr) call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr) call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr) call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr) if (useGPU) then #ifdef WITH_MPI na_rows = numroc(na, nblk, my_prow, 0, np_rows) na_cols = numroc(na, nblk, my_pcol, 0, np_cols) #else na_rows = na na_cols = na #endif endif success = .true. if (mod(nbw,nblk)/=0) then if (my_prow==0 .and. my_pcol==0) then if (wantDebug) then write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: ERROR: nbw=',nbw,', nblk=',nblk write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: band backtransform works only for nbw==n*nblk' endif success = .false. return endif endif nfact = nbw / nblk ! local number of eigenvectors l_nev = local_index(nev, my_pcol, np_cols, nblk, -1) if (l_nev==0) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif thread_width = 0 #endif stripe_width = 0 stripe_count = 0 last_stripe_width = 0 else ! Suggested stripe width is 48 - should this be reduced for the complex case ??? #ifdef WITH_OPENMP thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread #endif if (useGPU) then stripe_width = 256 else #ifdef DOUBLE_PRECISION_COMPLEX stripe_width = 48 ! Must be a multiple of 2 #else stripe_width = 48 ! Must be a multiple of 4 #endif endif #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif stripe_count = (thread_width-1)/stripe_width + 1 #else /* WITH_OPENMP */ stripe_count = (l_nev-1)/stripe_width + 1 #endif /* WITH_OPENMP */ ! 
Adapt stripe width so that last one doesn't get too small #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif stripe_width = (thread_width-1)/stripe_count + 1 #else /* WITH_OPENMP */ if (.not.(useGPU)) then stripe_width = (l_nev-1)/stripe_count + 1 endif #endif /* WITH_OPENMP */ if (.not.(useGPU)) then #ifdef DOUBLE_PRECISION_COMPLEX stripe_width = ((stripe_width+3)/4)*4 ! Rounded up to a multiple of 4; AVX/SSE memory alignment of 32 bytes ! only requires a multiple of 2 here (2 * sizeof(double complex) == 32) #else stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes ! (4 * sizeof(float complex) == 32) #endif endif #ifndef WITH_OPENMP last_stripe_width = l_nev - (stripe_count-1)*stripe_width #endif /* WITH_OPENMP */ endif ! Determine the matrix distribution at the beginning allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error when allocating limits "//errorMessage stop endif call determine_workload(na, nbw, np_rows, limits) max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) a_dim2 = max_blk_size + nbw !DEC$ ATTRIBUTES ALIGN: 64:: a #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif if (.not.(useGPU)) then if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_complex)) /= 0) then print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage stop endif call c_f_pointer(a_ptr, a, [stripe_width,a_dim2,stripe_count,max_threads] ) ! allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage) ! a(:,:,:,:) should be set to 0 in a parallel region, not here! endif #else /* OpenMP */ if (.not.(useGPU)) then if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_complex)) /= 0) then print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage stop endif call c_f_pointer(a_ptr, a, [stripe_width,a_dim2,stripe_count] ) ! allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage) a(:,:,:) = 0 endif #endif /* WITH_OPENMP */ allocate(row(l_nev), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating row "//errorMessage stop endif row(:) = 0 if (useGPU) then #ifdef DOUBLE_PRECISION_COMPLEX num = (stripe_width*a_dim2*stripe_count)*size_of_double_complex_datatype if (na_rows * na_cols .lt. stripe_width*a_dim2*stripe_count) then print *,"trans_ev_tridi_to_band_complex a_dev ",na_rows * na_cols, stripe_width*a_dim2*stripe_count ! stop endif successCUDA = cuda_malloc(a_dev, stripe_width*a_dim2*stripe_count*size_of_double_complex_datatype) #else num = (stripe_width*a_dim2*stripe_count)*size_of_single_complex_datatype if (na_rows * na_cols .lt. stripe_width*a_dim2*stripe_count) then print *,"trans_ev_tridi_to_band_complex a_dev ",na_rows * na_cols, stripe_width*a_dim2*stripe_count ! stop endif successCUDA = cuda_malloc(a_dev, stripe_width*a_dim2*stripe_count*size_of_single_complex_datatype) #endif if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc " stop endif if (num .gt. na_rows * na_cols) then print *,"trans_ev_tridi_to_band_complex a_dev 1",num, na_rows * na_cols !
stop endif successCUDA = cuda_memset(a_dev , 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemset " stop endif #ifdef DOUBLE_PRECISION_COMPLEX num = (l_nev)*size_of_double_complex_datatype #else num = (l_nev)*size_of_single_complex_datatype #endif successCUDA = cuda_malloc( row_dev,num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc " stop endif successCUDA = cuda_memset(row_dev , 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemset " stop endif ! "row_group" and "row_group_dev" are needed for GPU optimizations allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating row_group "//errorMessage stop endif #ifdef DOUBLE_PRECISION_COMPLEX row_group(:, :) = 0._ck8 #else row_group(:, :) = 0._ck4 #endif #ifdef DOUBLE_PRECISION_COMPLEX num = (l_nev*nblk)*size_of_double_complex_datatype #else num = (l_nev*nblk)*size_of_single_complex_datatype #endif successCUDA = cuda_malloc(row_group_dev, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc " stop endif successCUDA = cuda_memset(row_group_dev , 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemset " stop endif endif ! useGPU ! Copy q from a block cyclic distribution into a distribution with contiguous rows, ! and transpose the matrix using stripes of given stripe_width for cache blocking. ! The peculiar way it is done below is due to the fact that the last row should be ! ready first since it is the first one to start below #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif ! Please note about the OMP usage below: ! This is not for speed, but because we want the matrix a in the memory and ! in the cache of the correct thread (if possible) #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%start("OpenMP parallel_double") #else call timer%start("OpenMP parallel_single") #endif #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads #ifdef DOUBLE_PRECISION_COMPLEX a(:,:,:,my_thread) = 0._ck8 ! if possible, do first touch allocation! #else a(:,:,:,my_thread) = 0._ck4 ! if possible, do first touch allocation! #endif enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("OpenMP parallel_double") #else call timer%stop("OpenMP parallel_single") #endif #endif #endif /* WITH_OPENMP */ do ip = np_rows-1, 0, -1 if (my_prow == ip) then ! Receive my rows which have not yet been received src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) do i=limits(ip)+1,limits(ip+1) src = mod((i-1)/nblk, np_rows) if (src < my_prow) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr) #else call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, mpi_status, mpierr) #endif #else /* WITH_MPI */ row(1:l_nev) = row(1:l_nev) #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef DOUBLE_PRECISION_COMPLEX if (useGPU) then call unpack_and_prepare_row_group_complex_gpu_double(i - limits(ip), .false.) 
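!NOTE (added): on the GPU path incoming rows are not unpacked one by one;
! they are collected in row_group and flushed to row_group_dev in chunks of
! up to nblk rows by unpack_and_prepare_row_group_complex_gpu_*.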
#ifdef WITH_MPI call MPI_Recv(row_group(:, row_group_size), l_nev,MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #else row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct? #endif else #ifdef WITH_MPI call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #else row(1:l_nev) = row(1:l_nev) #endif endif #else /* DOUBLE_PRECISION_COMPLEX */ if (useGPU) then call unpack_and_prepare_row_group_complex_gpu_single(i - limits(ip), .false.) #ifdef WITH_MPI call MPI_Recv(row_group(:, row_group_size), l_nev,MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #else row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct? #endif else #ifdef WITH_MPI call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #else row(1:l_nev) = row(1:l_nev) #endif endif #endif /* DOUBLE_PRECISION_COMPLEX */ #endif /* WITH_OPENMP */ #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%start("OpenMP parallel_double") #else call timer%start("OpenMP parallel_single") #endif #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads #ifdef DOUBLE_PRECISION_COMPLEX call unpack_row_complex_cpu_openmp_double(a, row,i-limits(ip),my_thread, & stripe_count, thread_width, stripe_width, l_nev) #else call unpack_row_complex_cpu_openmp_single(a, row,i-limits(ip),my_thread, & stripe_count, thread_width, stripe_width, l_nev) #endif enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("OpenMP parallel_double") #else call timer%stop("OpenMP parallel_single") #endif #endif #else /* WITH_OPENMP */ if (.not.(useGPU)) then #ifdef DOUBLE_PRECISION_COMPLEX call unpack_row_complex_cpu_double(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) #else call unpack_row_complex_cpu_single(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) #endif endif #endif /* WITH_OPENMP */ elseif (src==my_prow) then src_offset = src_offset+1 if (useGPU) then #ifdef DOUBLE_PRECISION_COMPLEX call unpack_and_prepare_row_group_complex_gpu_double(i - limits(ip),.false.) #else call unpack_and_prepare_row_group_complex_gpu_single(i - limits(ip),.false.) 
#endif row_group(:, row_group_size) = q(src_offset, 1:l_nev) else row(:) = q(src_offset, 1:l_nev) endif #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%start("OpenMP parallel_double") #else call timer%start("OpenMP parallel_single") #endif #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads #ifdef DOUBLE_PRECISION_COMPLEX call unpack_row_complex_cpu_openmp_double(a, row,i-limits(ip),my_thread, & stripe_count, thread_width, stripe_width, l_nev) #else call unpack_row_complex_cpu_openmp_single(a, row,i-limits(ip),my_thread, & stripe_count, thread_width, stripe_width, l_nev) #endif enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("OpenMP parallel_double") #else call timer%stop("OpenMP parallel_single") #endif #endif #else /* WITH_OPENMP */ if (.not.(useGPU)) then #ifdef DOUBLE_PRECISION_COMPLEX call unpack_row_complex_cpu_double(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) #else call unpack_row_complex_cpu_single(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) #endif endif #endif /* WITH_OPENMP */ endif enddo ! Send all rows which have not yet been send src_offset = 0 do dst = 0, ip-1 do i=limits(dst)+1,limits(dst+1) if(mod((i-1)/nblk, np_rows) == my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Send(row, l_nev, MPI_COMPLEX16, dst, 0, mpi_comm_rows, mpierr) #else call MPI_Send(row, l_nev, MPI_COMPLEX8, dst, 0, mpi_comm_rows, mpierr) #endif #endif /* WITH_MPI */ endif enddo enddo else if(my_prow < ip) then ! Send all rows going to PE ip src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) do i=limits(ip)+1,limits(ip+1) src = mod((i-1)/nblk, np_rows) if (src == my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Send(row, l_nev, MPI_COMPLEX16, ip, 0, mpi_comm_rows, mpierr) #else call MPI_Send(row, l_nev, MPI_COMPLEX8, ip, 0, mpi_comm_rows, mpierr) #endif #endif /* WITH_MPI */ endif enddo ! Receive all rows from PE ip do i=limits(my_prow)+1,limits(my_prow+1) src = mod((i-1)/nblk, np_rows) if (src == ip) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr) #else call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, mpi_status, mpierr) #endif #else /* WITH_MPI */ row(1:l_nev) = row(1:l_nev) #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef DOUBLE_PRECISION_COMPLEX if (useGPU) then call unpack_and_prepare_row_group_complex_gpu_double(i - limits(my_prow), .false.) #ifdef WITH_MPI call MPI_Recv(row_group(:, row_group_size), l_nev,MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #else row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ? #endif else #ifdef WITH_MPI call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #else row(1:l_nev) = row(1:l_nev) #endif endif #else /* DOUBLE_PRECISION_COMPLEX */ if (useGPU) then call unpack_and_prepare_row_group_complex_gpu_single(i - limits(my_prow), .false.) 
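!NOTE (added): with MPI the row is received directly into the current
! row_group slot, avoiding an intermediate copy through the row(:) buffer.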
#ifdef WITH_MPI call MPI_Recv(row_group(:, row_group_size), l_nev,MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #else row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct #endif else #ifdef WITH_MPI call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #else row(1:l_nev) = row(1:l_nev) #endif endif #endif /* DOUBLE_PRECISION_COMPLEX */ #endif /* WITH_OPENMP */ #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%start("OpenMP parallel_double") #else call timer%start("OpenMP parallel_single") #endif #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads #ifdef DOUBLE_PRECISION_COMPLEX call unpack_row_complex_cpu_openmp_double(a, row,i-limits(my_prow),my_thread, & stripe_count, thread_width, stripe_width, l_nev) #else call unpack_row_complex_cpu_openmp_single(a, row,i-limits(my_prow),my_thread, & stripe_count, thread_width, stripe_width, l_nev) #endif enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("OpenMP parallel_double") #else call timer%stop("OpenMP parallel_single") #endif #endif #else /* WITH_OPENMP */ if (.not.(useGPU)) then #ifdef DOUBLE_PRECISION_COMPLEX call unpack_row_complex_cpu_double(a, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width) #else call unpack_row_complex_cpu_single(a, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width) #endif endif #endif /* WITH_OPENMP */ endif enddo endif enddo if (useGPU) then #ifdef DOUBLE_PRECISION_COMPLEX call unpack_and_prepare_row_group_complex_gpu_double(-1, .true.) #else call unpack_and_prepare_row_group_complex_gpu_single(-1, .true.) #endif successCUDA = cuda_devicesynchronize() if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaDeviceSynchronize" stop endif endif ! Set up result buffer queue num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows num_result_buffers = 4*nfact allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating result_buffer "//errorMessage stop endif allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating result_send_request "//errorMessage stop endif allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating result_recv_request "//errorMessage stop endif #ifdef WITH_MPI result_send_request(:) = MPI_REQUEST_NULL result_recv_request(:) = MPI_REQUEST_NULL #endif ! Queue up buffers #ifdef WITH_MPI if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends do j = 1, min(num_result_buffers, num_result_blocks) #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, & mpi_comm_rows, result_recv_request(j), mpierr) #else call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_COMPLEX8, 0, result_recv_tag, & mpi_comm_rows, result_recv_request(j), mpierr) #endif enddo endif #else /* WITH_MPI */ ! careful: the "receive" has to be done at the corresponding wait or send !if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends !
do j = 1, min(num_result_buffers, num_result_blocks) ! result_buffer(1:l_nev*nblk,1,j) = result_buffer(1:l_nev*nblk,1,nbuf) ! enddo !endif #endif /* WITH_MPI */ num_bufs_recvd = 0 ! No buffers received yet ! Initialize top/bottom requests allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating top_send_request "//errorMessage stop endif allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating top_recv_request "//errorMessage stop endif allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating bottom_send_request "//errorMessage stop endif allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating bottom_recv_request "//errorMessage stop endif #ifdef WITH_MPI top_send_request(:) = MPI_REQUEST_NULL top_recv_request(:) = MPI_REQUEST_NULL bottom_send_request(:) = MPI_REQUEST_NULL bottom_recv_request(:) = MPI_REQUEST_NULL #endif #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating top_border_send_buffer "//errorMessage stop endif allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating top_border_recv_buffer "//errorMessage stop endif allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_send_buffer "//errorMessage stop endif allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_recv_buffer "//errorMessage stop endif #ifdef DOUBLE_PRECISION_COMPLEX top_border_send_buffer(:,:) = 0._ck8 top_border_recv_buffer(:,:) = 0._ck8 bottom_border_send_buffer(:,:) = 0._ck8 bottom_border_recv_buffer(:,:) = 0._ck8 #else top_border_send_buffer(:,:) = 0._ck4 top_border_recv_buffer(:,:) = 0._ck4 bottom_border_send_buffer(:,:) = 0._ck4 bottom_border_recv_buffer(:,:) = 0._ck4 #endif #else /* OpenMP */ allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating top_border_send_buffer "//errorMessage stop endif allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating top_border_recv_buffer "//errorMessage stop endif allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_send_buffer "//errorMessage stop endif allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 
0) then print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_recv_buffer "//errorMessage stop endif #ifdef DOUBLE_PRECISION_COMPLEX top_border_send_buffer(:,:,:) = 0._ck8 top_border_recv_buffer(:,:,:) = 0._ck8 bottom_border_send_buffer(:,:,:) = 0._ck8 bottom_border_recv_buffer(:,:,:) = 0._ck8 #else top_border_send_buffer(:,:,:) = 0._ck4 top_border_recv_buffer(:,:,:) = 0._ck4 bottom_border_send_buffer(:,:,:) = 0._ck4 bottom_border_recv_buffer(:,:,:) = 0._ck4 #endif #endif /* WITH_OPENMP */ ! Initialize broadcast buffer allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating bcast_buffer "//errorMessage stop endif bcast_buffer = 0 if (useGPU) then #ifdef DOUBLE_PRECISION_COMPLEX num = ( nbw * max_blk_size) * size_of_double_complex_datatype #else num = ( nbw * max_blk_size) * size_of_single_complex_datatype #endif successCUDA = cuda_malloc(bcast_buffer_dev, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc" stop endif successCUDA = cuda_memset( bcast_buffer_dev, 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemset" stop endif #ifdef DOUBLE_PRECISION_COMPLEX num = ((max_blk_size-1))*size_of_double_complex_datatype #else num = ((max_blk_size-1))*size_of_single_complex_datatype #endif successCUDA = cuda_malloc( hh_dot_dev, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc" stop endif successCUDA = cuda_memset( hh_dot_dev, 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemset" stop endif #ifdef DOUBLE_PRECISION_COMPLEX num = (max_blk_size)*size_of_double_complex_datatype #else num = (max_blk_size)*size_of_single_complex_datatype #endif successCUDA = cuda_malloc( hh_tau_dev, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc" stop endif successCUDA = cuda_memset( hh_tau_dev, 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemset" stop endif endif ! useGPU current_tv_off = 0 ! Offset of next row to be broadcast ! ------------------- start of work loop ------------------- a_off = 0 ! offset in A (to avoid unnecessary shifts) top_msg_length = 0 bottom_msg_length = 0 #ifdef WITH_GPU_VERSION !! istat = cuda_ProfilerStart() !! istat = cudaFuncSetCacheConfig ( launch_compute_hh_trafo_c_kernel_complex, cudaFuncCachePreferShared) !! t0_compute_kernel = 0 ! t0_mpi_time = 0 ! t0_cuda_memcpy =0 ! t0_cpu_code =0 ! t0_outer_do_time =0 ! t0_inner_do_time =0 ! t1_outer_do_time =MPI_Wtime() ! t0_block_time =0 ! t0_mpi_wait_time = 0 ! t0_memcpy_time = 0 ! t0_mpi_outer_wait_time=0 #endif do sweep = 0, (na-1)/nbw #ifdef WITH_GPU_VERSION ! t1_cpu_code =MPI_Wtime() #endif current_n = na - sweep*nbw call determine_workload(current_n, nbw, np_rows, limits) current_n_start = limits(my_prow) current_n_end = limits(my_prow+1) current_local_n = current_n_end - current_n_start next_n = max(current_n - nbw, 0) call determine_workload(next_n, nbw, np_rows, limits) next_n_start = limits(my_prow) next_n_end = limits(my_prow+1) next_local_n = next_n_end - next_n_start if (next_n_end < next_n) then bottom_msg_length = current_n_end - next_n_end else bottom_msg_length = 0 endif if (next_local_n > 0) then next_top_msg_length = current_n_start - next_n_start else next_top_msg_length = 0 endif #ifdef WITH_GPU_VERSION ! t2_cpu_code =MPI_Wtime() ! 
t0_cpu_code = t0_cpu_code + (t2_cpu_code - t1_cpu_code) #endif if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then do i = 1, stripe_count #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width" b_len = csw*nbw*max_threads #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #else call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_COMPLEX8, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #endif #else /* WITH_MPI */ ! careful: the "receive" has to be done at the corresponding wait or send ! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #else call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX8, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #endif #else /* WITH_MPI */ ! careful: the "receive" has to be done at the corresponding wait or send ! bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ enddo endif #ifdef WITH_GPU_VERSION ! t1_block_time = MPI_Wtime() #endif if (current_local_n > 1) then if (my_pcol == mod(sweep,np_cols)) then bcast_buffer(:,1:current_local_n) = hh_trans_complex(:,current_tv_off+1:current_tv_off+current_local_n) current_tv_off = current_tv_off + current_local_n endif #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_COMPLEX16, mod(sweep,np_cols), mpi_comm_cols, mpierr) #else call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_COMPLEX8, mod(sweep,np_cols), mpi_comm_cols, mpierr) #endif #endif /* WITH_MPI */ if (useGPU) then #ifdef DOUBLE_PRECISION_COMPLEX successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), nbw * & current_local_n * size_of_double_complex_datatype , & cudaMemcpyHostToDevice) call extract_hh_tau_complex_gpu_double(nbw, current_local_n, .false.) call compute_hh_dot_products_complex_gpu_double(nbw, current_local_n) #else successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), nbw * & current_local_n * size_of_single_complex_datatype , & cudaMemcpyHostToDevice) call extract_hh_tau_complex_gpu_single(nbw, current_local_n, .false.) call compute_hh_dot_products_complex_gpu_single(nbw, current_local_n) #endif endif else ! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_complex #ifdef DOUBLE_PRECISION_COMPLEX bcast_buffer(:,1) = 0._ck8 #else bcast_buffer(:,1) = 0._ck4 #endif if (useGPU) then #ifdef DOUBLE_PRECISION_COMPLEX successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * size_of_double_complex_datatype) #else successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * size_of_single_complex_datatype) #endif if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemset" stop endif #ifdef DOUBLE_PRECISION_COMPLEX call extract_hh_tau_complex_gpu_double(nbw, 1, .true.) #else call extract_hh_tau_complex_gpu_single(nbw, 1, .true.) #endif
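!NOTE (added): hh_tau_dev and hh_dot_dev hold the tau factors and the dot
! products of consecutive Householder vectors used by the GPU kernels;
! extract_hh_tau_complex_gpu_* fills them from bcast_buffer_dev, and the
! .true. flag presumably marks the special all-zero vector of this branch.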
!NOTE(ca): I commented out the following line ! istat = cuda_memcpy(loc(bcast_buffer(1,1)),bcast_buffer_dev,nbw*current_local_n * size_of_complex_datatype , ! cudaMemcpyDeviceToHost) ! if (istat .ne. 0) then ! print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy" ! stop ! endif endif ! useGPU endif #ifdef WITH_GPU_VERSION ! t2_block_time =MPI_Wtime() ! t0_block_time = t0_block_time + ( t2_block_time - t1_block_time) #endif if (l_nev == 0) cycle if (current_local_n > 0) then #ifdef WITH_GPU_VERSION ! t1_inner_do_time =MPI_Wtime() #endif do i = 1, stripe_count #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif ! Get real stripe width for stripe i; ! The last OpenMP tasks may have an even smaller stripe width, ! but we don't care about this, i.e. we send/recv a bit too much in this case. ! csw: current_stripe_width csw = min(stripe_width, thread_width-(i-1)*stripe_width) #endif /* WITH_OPENMP */ !wait_b if (current_n_end < current_n) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif #ifdef WITH_MPI call MPI_Wait(bottom_recv_request(i), mpi_status, mpierr) #endif #else /* WITH_OPENMP */ #ifdef WITH_GPU_VERSION ! t1_mpi_wait_time =MPI_Wtime() #endif #ifdef WITH_MPI call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) #endif #ifdef WITH_GPU_VERSION ! t2_mpi_wait_time =MPI_Wtime() ! t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time - t1_mpi_wait_time) ! ! t1_memcpy_time =MPI_Wtime() #endif #endif /* WITH_OPENMP */ #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%start("OpenMP parallel_double") #else call timer%start("OpenMP parallel_single") #endif #endif !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads n_off = current_local_n+a_off b_len = csw*nbw b_off = (my_thread-1)*b_len a(1:csw,n_off+1:n_off+nbw,i,my_thread) = & reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("OpenMP parallel_double") #else call timer%stop("OpenMP parallel_single") #endif #endif #else /* WITH_OPENMP */ n_off = current_local_n+a_off if (useGPU) then ! t1_memcpy_time =MPI_Wtime() #ifdef DOUBLE_PRECISION_COMPLEX dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_double_complex_datatype successCUDA = cuda_memcpy( a_dev + dev_offset ,loc(bottom_border_recv_buffer(1,1,i)), & stripe_width*nbw*size_of_double_complex_datatype ,cudaMemcpyHostToDevice) #else dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_single_complex_datatype successCUDA = cuda_memcpy( a_dev + dev_offset ,loc(bottom_border_recv_buffer(1,1,i)), & stripe_width*nbw*size_of_single_complex_datatype ,cudaMemcpyHostToDevice) #endif if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy" stop endif ! t2_memcpy_time =MPI_Wtime() !
t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time - t1_memcpy_time) else a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i) endif #endif /* WITH_OPENMP */ if (next_n_end < next_n) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #else call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & MPI_COMPLEX8, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #endif #else /* WITH_MPI */ ! careful: the "receive" has to be done at the corresponding wait or send ! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #else call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX8, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #endif #else /* WITH_MPI */ ! careful: the "receive" has to be done at the corresponding wait or send ! bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ endif endif if (current_local_n <= bottom_msg_length + top_msg_length) then !wait_t if (top_msg_length>0) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif #ifdef WITH_MPI call MPI_Wait(top_recv_request(i), mpi_status, mpierr) #endif #else /* WITH_OPENMP */ #ifdef WITH_GPU_VERSION ! t1_mpi_wait_time =MPI_Wtime() #endif #ifdef WITH_MPI call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) #endif if (useGPU) then ! t2_mpi_wait_time =MPI_Wtime() ! t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time -t1_mpi_wait_time) ! t1_memcpy_time =MPI_Wtime() ! #ifdef DOUBLE_PRECISION_COMPLEX dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * & size_of_double_complex_datatype #else dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * & size_of_single_complex_datatype #endif ! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ))* 16 #ifdef DOUBLE_PRECISION_COMPLEX successCUDA = cuda_memcpy( a_dev+dev_offset ,loc(top_border_recv_buffer(1,1,i)), & stripe_width*top_msg_length*size_of_double_complex_datatype , cudaMemcpyHostToDevice) #else successCUDA = cuda_memcpy( a_dev+dev_offset ,loc(top_border_recv_buffer(1,1,i)), & stripe_width*top_msg_length*size_of_single_complex_datatype , cudaMemcpyHostToDevice) #endif if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy" stop endif ! t2_memcpy_time =MPI_Wtime() ! t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time - t1_memcpy_time) else a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) endif !
useGPU #endif /* WITH_OPENMP */ endif !compute #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%start("OpenMP parallel_double") #else call timer%start("OpenMP parallel_single") #endif #endif !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads if (top_msg_length>0) then b_len = csw*top_msg_length b_off = (my_thread-1)*b_len a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) endif #ifdef DOUBLE_PRECISION_COMPLEX call compute_hh_trafo_complex_cpu_openmp_double(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, current_local_n, i, my_thread, thread_width, & THIS_COMPLEX_ELPA_KERNEL) #else call compute_hh_trafo_complex_cpu_openmp_single(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, current_local_n, i, my_thread, thread_width, & THIS_COMPLEX_ELPA_KERNEL) #endif enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("OpenMP parallel_double") #else call timer%stop("OpenMP parallel_single") #endif #endif #else /* WITH_OPENMP */ if (useGPU) then #ifdef DOUBLE_PRECISION_COMPLEX call compute_hh_trafo_complex_gpu_double(0, current_local_n, i, a_off, dev_offset, dev_offset_1, dev_offset_2) ! call compute_hh_trafo_complex_gpu_double(0, current_local_n, i) #else call compute_hh_trafo_complex_gpu_single(0, current_local_n, i, a_off, dev_offset, dev_offset_1, dev_offset_2) ! call compute_hh_trafo_complex_gpu_single(0, current_local_n, i) #endif else #ifdef DOUBLE_PRECISION_COMPLEX call compute_hh_trafo_complex_cpu_double(a, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, current_local_n, i, last_stripe_width, & THIS_COMPLEX_ELPA_KERNEL) #else call compute_hh_trafo_complex_cpu_single(a, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, current_local_n, i, last_stripe_width, & THIS_COMPLEX_ELPA_KERNEL) #endif endif #endif /* WITH_OPENMP */ !send_b #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif #ifdef WITH_MPI call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) #endif #else /* WITH_OPENMP */ #ifdef WITH_GPU_VERSION ! t1_mpi_wait_time =MPI_Wtime() #endif #ifdef WITH_MPI call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) #endif #ifdef WITH_GPU_VERSION ! t2_mpi_wait_time =MPI_Wtime() ! 
t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time-t1_mpi_wait_time) #endif #endif /* WITH_OPENMP */ if (bottom_msg_length>0) then n_off = current_local_n+nbw-bottom_msg_length+a_off #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif b_len = csw*bottom_msg_length*max_threads bottom_border_send_buffer(1:b_len,i) = & reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #else call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX8, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #endif #else /* WITH_MPI */ if (next_top_msg_length > 0) then top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) endif #endif /* WITH_MPI */ #else /* WITH_OPENMP */ if (useGPU) then ! t1_memcpy_time =MPI_Wtime() #ifdef DOUBLE_PRECISION_COMPLEX dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * & size_of_double_complex_datatype successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, & stripe_width * bottom_msg_length * size_of_double_complex_datatype , & cudaMemcpyDeviceToHost) #else dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * & size_of_single_complex_datatype successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, & stripe_width * bottom_msg_length * size_of_single_complex_datatype , & cudaMemcpyDeviceToHost) #endif if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy" stop endif ! t2_memcpy_time =MPI_Wtime() ! 
t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time -t1_memcpy_time) else bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) endif #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #else call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX8, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #endif #else /* WITH_MPI */ if (next_top_msg_length > 0) then top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = & bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i) endif #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ endif else !compute #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%start("OpenMP parallel_double") #else call timer%start("OpenMP parallel_single") #endif #endif !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads #ifdef DOUBLE_PRECISION_COMPLEX call compute_hh_trafo_complex_cpu_openmp_double(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, & thread_width, THIS_COMPLEX_ELPA_KERNEL) #else call compute_hh_trafo_complex_cpu_openmp_single(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, & thread_width, THIS_COMPLEX_ELPA_KERNEL) #endif enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("OpenMP parallel_double") #else call timer%stop("OpenMP parallel_single") #endif #endif #else /* WITH_OPENMP */ #ifdef DOUBLE_PRECISION_COMPLEX if (useGPU) then call compute_hh_trafo_complex_gpu_double(current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, & dev_offset, dev_offset_1, dev_offset_2) ! call compute_hh_trafo_complex_gpu_double(current_local_n -bottom_msg_length, bottom_msg_length, i) else call compute_hh_trafo_complex_cpu_double(a, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & current_local_n - bottom_msg_length, bottom_msg_length, i, & last_stripe_width, THIS_COMPLEX_ELPA_KERNEL) endif #else if (useGPU) then call compute_hh_trafo_complex_gpu_single(current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, & dev_offset, dev_offset_1, dev_offset_2) ! call compute_hh_trafo_complex_gpu_single(current_local_n -bottom_msg_length, bottom_msg_length, i) else call compute_hh_trafo_complex_cpu_single(a, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & current_local_n - bottom_msg_length, bottom_msg_length, i, & last_stripe_width, THIS_COMPLEX_ELPA_KERNEL) endif #endif #endif /* WITH_OPENMP */ !send_b #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif #ifdef WITH_MPI call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) #endif #else /* WITH_OPENMP */ #ifdef WITH_GPU_VERSION !
t1_mpi_wait_time =MPI_Wtime() #endif #ifdef WITH_MPI call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) #endif #ifdef WITH_GPU_VERSION ! t2_mpi_wait_time =MPI_Wtime() ! t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time-t1_mpi_wait_time) #endif #endif /* WITH_OPENMP */ if (bottom_msg_length > 0) then n_off = current_local_n+nbw-bottom_msg_length+a_off #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif b_len = csw*bottom_msg_length*max_threads bottom_border_send_buffer(1:b_len,i) = & reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #else call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX8, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #endif #else /* WITH_MPI */ if (next_top_msg_length > 0) then top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) endif #endif /* WITH_MPI */ #else /* WITH_OPENMP */ if (useGPU) then ! t1_memcpy_time =MPI_Wtime() #ifdef DOUBLE_PRECISION_COMPLEX dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * & size_of_double_complex_datatype successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, & stripe_width * bottom_msg_length * size_of_double_complex_datatype , & cudaMemcpyDeviceToHost) #else dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * & size_of_single_complex_datatype successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, & stripe_width * bottom_msg_length * size_of_single_complex_datatype , & cudaMemcpyDeviceToHost) #endif if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy" stop endif ! t2_memcpy_time =MPI_Wtime() ! 
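! ---------------------------------------------------------------------- ! Note (comments only): the dev_offset used for these border copies ! addresses column n_off of stripe i in the flattened device array ! a_dev, which stores stripe_width x a_dim2 elements per stripe: !   dev_offset = ( n_off*stripe_width + (i-1)*stripe_width*a_dim2 ) * sizeof(element) ! e.g. (made-up sizes) stripe_width=64, a_dim2=1000, i=3, n_off=10 ! addresses element 10*64 + 2*64*1000 = 128640 before the byte scaling. ! ----------------------------------------------------------------------
!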
t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time -t1_memcpy_time) else bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) endif #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #else call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX8, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #endif #else /* WITH_MPI */ if (next_top_msg_length > 0) then top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = & bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i) endif #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ endif !compute #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_complex: not yet implemented" stop endif #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%start("OpenMP parallel_double") #else call timer%start("OpenMP parallel_single") #endif #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads #ifdef DOUBLE_PRECISION_COMPLEX call compute_hh_trafo_complex_cpu_openmp_double(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, & kernel_time, top_msg_length, & current_local_n-top_msg_length-bottom_msg_length, i, & my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL) #else call compute_hh_trafo_complex_cpu_openmp_single(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, & kernel_time, top_msg_length, & current_local_n-top_msg_length-bottom_msg_length, i, & my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL) #endif enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("OpenMP parallel_double") #else call timer%stop("OpenMP parallel_single") #endif #endif #else /* WITH_OPENMP */ #ifdef DOUBLE_PRECISION_COMPLEX if (useGPU) then ! call compute_hh_trafo_complex_gpu_double(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, i) call compute_hh_trafo_complex_gpu_double(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, & i, a_off, & dev_offset, dev_offset_1, dev_offset_2) else call compute_hh_trafo_complex_cpu_double(a, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & last_stripe_width, THIS_COMPLEX_ELPA_KERNEL) #else if (useGPU) then ! call compute_hh_trafo_complex_gpu_single(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, i) call compute_hh_trafo_complex_gpu_single(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, & i, a_off, & dev_offset, dev_offset_1, dev_offset_2) else call compute_hh_trafo_complex_cpu_single(a, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & last_stripe_width, THIS_COMPLEX_ELPA_KERNEL) #endif endif #endif /* WITH_OPENMP */ !wait_t if (top_msg_length>0) then #ifdef WITH_OPENMP #ifdef WITH_MPI call MPI_Wait(top_recv_request(i), mpi_status, mpierr) #endif #else /* WITH_OPENMP */ #ifdef WITH_GPU_VERSION ! 
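! ---------------------------------------------------------------------- ! Note (comments only): each stripe's current_local_n columns are ! updated in three slices so communication overlaps computation: !   1) the trailing bottom_msg_length columns first, so the bottom !      border can be sent to my_prow+1 as early as possible, !   2) the interior current_local_n-top_msg_length-bottom_msg_length !      columns, overlapped with the border exchange, !   3) the leading top_msg_length columns last, once the top border !      from my_prow-1 has arrived. ! ----------------------------------------------------------------------
!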
t1_mpi_wait_time =MPI_Wtime() #endif #ifdef WITH_MPI call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) #endif if (useGPU) then ! t2_mpi_wait_time =MPI_Wtime() ! t0_mpi_wait_time = t0_mpi_wait_time +(t2_mpi_wait_time-t1_mpi_wait_time) ! ! t1_memcpy_time =MPI_Wtime() #ifdef DOUBLE_PRECISION_COMPLEX dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * & size_of_double_complex_datatype successCUDA = cuda_memcpy( a_dev + dev_offset , loc(top_border_recv_buffer(:,1,i)), & stripe_width * top_msg_length *size_of_double_complex_datatype , & cudaMemcpyHostToDevice) #else dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * & size_of_single_complex_datatype successCUDA = cuda_memcpy( a_dev + dev_offset , loc(top_border_recv_buffer(:,1,i)), & stripe_width * top_msg_length *size_of_single_complex_datatype , & cudaMemcpyHostToDevice) #endif if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy" stop endif ! ! t2_memcpy_time =MPI_Wtime() ! t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time-t1_memcpy_time) else a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) endif #endif /* WITH_OPENMP */ endif !compute #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%start("OpenMP parallel_double") #else call timer%start("OpenMP parallel_single") #endif #endif !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads if (top_msg_length>0) then b_len = csw*top_msg_length b_off = (my_thread-1)*b_len a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) endif #ifdef DOUBLE_PRECISION_COMPLEX call compute_hh_trafo_complex_cpu_openmp_double(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, top_msg_length, i, my_thread, thread_width, & THIS_COMPLEX_ELPA_KERNEL) #else call compute_hh_trafo_complex_cpu_openmp_single(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, top_msg_length, i, my_thread, thread_width, & THIS_COMPLEX_ELPA_KERNEL) #endif enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("OpenMP parallel_double") #else call timer%stop("OpenMP parallel_single") #endif #endif #else /* WITH_OPENMP */ #ifdef DOUBLE_PRECISION_COMPLEX if (useGPU) then call compute_hh_trafo_complex_gpu_double(0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2) ! call compute_hh_trafo_complex_gpu_double(0, top_msg_length, i) else call compute_hh_trafo_complex_cpu_double(a, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, top_msg_length, i, last_stripe_width, & THIS_COMPLEX_ELPA_KERNEL) endif #else if (useGPU) then call compute_hh_trafo_complex_gpu_single(0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2) ! 
call compute_hh_trafo_complex_gpu_single(0, top_msg_length, i) else call compute_hh_trafo_complex_cpu_single(a, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, top_msg_length, i, last_stripe_width, & THIS_COMPLEX_ELPA_KERNEL) endif #endif #endif /* WITH_OPENMP */ endif if (next_top_msg_length > 0) then !request top_border data #ifdef WITH_OPENMP b_len = csw*next_top_msg_length*max_threads #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow-1, & top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) #else call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_COMPLEX8, my_prow-1, & top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) #endif #else /* WITH_MPI */ ! careful: the "receive" has to be done at the corresponding send or wait ! top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_COMPLEX16, my_prow-1, & top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) #else call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_COMPLEX8, my_prow-1, & top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) #endif #else /* WITH_MPI */ ! careful: the "receive" has to be done at the corresponding send or wait ! top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = & ! bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i) #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ endif !send_t if (my_prow > 0) then #ifdef WITH_OPENMP #ifdef WITH_MPI call MPI_Wait(top_send_request(i), mpi_status, mpierr) #endif #else /* WITH_OPENMP */ #ifdef WITH_GPU_VERSION ! t1_mpi_wait_time =MPI_Wtime() #endif #ifdef WITH_MPI call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) #endif #ifdef WITH_GPU_VERSION ! t2_mpi_wait_time =MPI_Wtime() ! t0_mpi_wait_time = t0_mpi_wait_time+(t2_mpi_wait_time-t1_mpi_wait_time) #endif #endif /* WITH_OPENMP */ #ifdef WITH_OPENMP b_len = csw*nbw*max_threads top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_COMPLEX16, & my_prow-1, bottom_recv_tag, & mpi_comm_rows, top_send_request(i), mpierr) #else call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_COMPLEX8, & my_prow-1, bottom_recv_tag, & mpi_comm_rows, top_send_request(i), mpierr) #endif #else /* WITH_MPI */ if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) endif if (next_n_end < next_n) then bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) endif #endif /* WITH_MPI */ #else /* WITH_OPENMP */ if (useGPU) then !
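! ---------------------------------------------------------------------- ! Note (comments only): the message tags pair the borders of ! neighbouring process rows. What row my_prow sends downward with ! MPI_Isend(..., my_prow+1, top_recv_tag, ...) is matched by row ! my_prow+1's MPI_Irecv(..., my_prow-1, top_recv_tag, ...), i.e. one ! rank's bottom border becomes the next rank's top border; the ! top_border_send_buffer / bottom_recv_tag exchange works the same ! way in the upward direction. ! ----------------------------------------------------------------------
!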
t1_memcpy_time =MPI_Wtime() #ifdef DOUBLE_PRECISION_COMPLEX dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * & size_of_double_complex_datatype successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), a_dev + dev_offset, & stripe_width*nbw*size_of_double_complex_datatype ,cudaMemcpyDeviceToHost) #else dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * & size_of_single_complex_datatype successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), a_dev + dev_offset, & stripe_width*nbw*size_of_single_complex_datatype ,cudaMemcpyDeviceToHost) #endif if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy" stop endif ! t2_memcpy_time =MPI_Wtime() ! t0_memcpy_time = t0_memcpy_time + (t2_memcpy_time-t1_memcpy_time) ! else top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i) endif #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow-1, bottom_recv_tag, & mpi_comm_rows, top_send_request(i), mpierr) #else call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX8, my_prow-1, bottom_recv_tag, & mpi_comm_rows, top_send_request(i), mpierr) #endif #else /* WITH_MPI */ if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then bottom_border_recv_buffer(1:nbw,1:stripe_width,i) = top_border_send_buffer(1:nbw,1:stripe_width,i) endif if (next_n_end < next_n) then bottom_border_recv_buffer(1:nbw,1:stripe_width,i) = top_border_send_buffer(1:nbw,1:stripe_width,i) endif #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ endif ! Care that there are not too many outstanding top_recv_request's #ifdef WITH_GPU_VERSION ! t1_mpi_wait_time =MPI_Wtime() #endif if (stripe_count > 1) then if (i>1) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(top_recv_request(i-1), mpi_status, mpierr) #else call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ else #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(top_recv_request(stripe_count), mpi_status, mpierr) #else call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ endif endif #ifdef WITH_GPU_VERSION ! t2_mpi_wait_time =MPI_Wtime() ! t0_mpi_wait_time = t0_mpi_wait_time+(t2_mpi_wait_time-t1_mpi_wait_time) #endif enddo #ifdef WITH_GPU_VERSION ! t2_inner_do_time =MPI_Wtime() ! t0_inner_do_time = t0_inner_do_time + ( t2_inner_do_time - t1_inner_do_time) #endif top_msg_length = next_top_msg_length else ! wait for last top_send_request #ifdef WITH_GPU_VERSION ! t1_mpi_outer_wait_time =MPI_Wtime() #endif do i = 1, stripe_count #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(top_send_request(i), mpi_status, mpierr) #else call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ enddo #ifdef WITH_GPU_VERSION ! t2_mpi_outer_wait_time =MPI_Wtime() ! t0_mpi_outer_wait_time =t0_mpi_outer_wait_time+(t2_mpi_outer_wait_time-t1_mpi_outer_wait_time) #endif endif #ifdef WITH_GPU_VERSION ! t0_result_time = MPI_Wtime() #endif ! Care about the result if (my_prow == 0) then ! topmost process sends nbw rows to destination processes do j=0,nfact-1 num_blk = sweep*nfact+j ! global number of destination block, 0 based if (num_blk*nblk >= na) exit nbuf = mod(num_blk, num_result_buffers) + 1 ! 
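! ---------------------------------------------------------------------- ! Illustration (comments only): result block num_blk is owned by ! process row dst = mod(num_blk, np_rows) and cycles through the ! num_result_buffers send buffers via !   nbuf = mod(num_blk, num_result_buffers) + 1 ! e.g. (made-up sizes) np_rows=4, num_result_buffers=4: blocks ! 0,4,8,... stay on row 0, while block 5 goes to row 1 via buffer 2. ! ----------------------------------------------------------------------
!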
buffer number to get this block #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(result_send_request(nbuf), mpi_status, mpierr) #else call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ dst = mod(num_blk, np_rows) if (dst == 0) then if (useGPU) then row_group_size = min(na - num_blk*nblk, nblk) #ifdef DOUBLE_PRECISION_COMPLEX call pack_row_group_complex_gpu_double(row_group(:, :), j * nblk + a_off,row_group_size) #else call pack_row_group_complex_gpu_single(row_group(:, :), j * nblk + a_off,row_group_size) #endif do i = 1, row_group_size q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i) enddo else do i = 1, min(na - num_blk*nblk, nblk) #ifdef DOUBLE_PRECISION_COMPLEX #ifdef WITH_OPENMP call pack_row_complex_cpu_openmp_double(a, row, j*nblk+i+a_off, & stripe_width, stripe_count, max_threads, thread_width, l_nev) #else call pack_row_complex_cpu_double(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) #endif #else #ifdef WITH_OPENMP call pack_row_complex_cpu_openmp_single(a, row, j*nblk+i+a_off, & stripe_width, stripe_count, max_threads, thread_width, l_nev) #else call pack_row_complex_cpu_single(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) #endif #endif q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) enddo endif else if (useGPU) then #ifdef DOUBLE_PRECISION_COMPLEX call pack_row_group_complex_gpu_double(result_buffer(:, :, nbuf), j * nblk + a_off, nblk) #else call pack_row_group_complex_gpu_single(result_buffer(:, :, nbuf), j * nblk + a_off, nblk) #endif else do i = 1, nblk #ifdef DOUBLE_PRECISION_COMPLEX #ifdef WITH_OPENMP call pack_row_complex_cpu_openmp_double(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, & stripe_width, stripe_count, max_threads, thread_width, l_nev) #else call pack_row_complex_cpu_double(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, & last_stripe_width, stripe_count) #endif #else #ifdef WITH_OPENMP call pack_row_complex_cpu_openmp_single(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, & stripe_width, stripe_count, max_threads, thread_width, l_nev) #else call pack_row_complex_cpu_single(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, & last_stripe_width, stripe_count) #endif #endif enddo endif #ifdef WITH_MPI #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, dst, & result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) #else call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX8, dst, & result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) #endif #else /* WITH_MPI */ if (j+num_result_buffers < num_result_blocks) & result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf) if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends do j1 = 1, min(num_result_buffers, num_result_blocks) result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf) enddo endif #endif /* WITH_MPI */ endif enddo else ! receive and store final result do j = num_bufs_recvd, num_result_blocks-1 nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block ! If there is still work to do, just test for the next result request ! and leave the loop if it is not ready, otherwise wait for all ! outstanding requests if (next_local_n > 0) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Test(result_recv_request(nbuf), flag, mpi_status, mpierr) #else call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) #endif #else /* WITH_MPI */ flag = .true. 
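! ---------------------------------------------------------------------- ! Note (comments only): while further local work remains ! (next_local_n > 0) the result receive is only MPI_Test'ed, so the ! loop can leave early and resume computation instead of blocking; ! only when no local work is left does it MPI_Wait for each ! outstanding result block. ! ----------------------------------------------------------------------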
#endif /* WITH_MPI */ if (.not.flag) exit else #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(result_recv_request(nbuf), mpi_status, mpierr) #else call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) #endif #else /* WITH_MPI */ #endif /* WITH_MPI */ endif ! Fill result buffer into q num_blk = j*np_rows + my_prow ! global number of current block, 0 based do i = 1, min(na - num_blk*nblk, nblk) q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) enddo ! Queue result buffer again if there are outstanding blocks left #ifdef WITH_MPI if (j+num_result_buffers < num_result_blocks) & #ifdef DOUBLE_PRECISION_COMPLEX call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, & mpi_comm_rows, result_recv_request(nbuf), mpierr) #else call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX8, 0, result_recv_tag, & mpi_comm_rows, result_recv_request(nbuf), mpierr) #endif #else /* WITH_MPI */ ! careful: the "receive" has to be done at the corresponding wait or send ! if (j+num_result_buffers < num_result_blocks) & ! result_buffer(1:l_nev*nblk,1,nbuf) = result_buffer(1:l_nev*nblk,1,nbuf) #endif /* WITH_MPI */ enddo num_bufs_recvd = j endif #ifdef WITH_GPU_VERSION ! t2_result_time =MPI_Wtime() ! t0_result_time = t0_result_time + ( t2_result_time - t1_result_time) #endif ! Shift the remaining rows to the front of A (if necessary) offset = nbw - top_msg_length if (offset<0) then if (wantDebug) then write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: internal error, offset for shifting = ',offset endif success = .false. return endif a_off = a_off + offset if (a_off + next_local_n + nbw > a_dim2) then #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%start("OpenMP parallel_double") #else call timer%start("OpenMP parallel_single") #endif #endif !$omp parallel do private(my_thread, i, j), schedule(static, 1) do my_thread = 1, max_threads do i = 1, stripe_count do j = top_msg_length+1, top_msg_length+next_local_n A(:,j,i,my_thread) = A(:,j+a_off,i,my_thread) enddo enddo enddo #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("OpenMP parallel_double") #else call timer%stop("OpenMP parallel_single") #endif #endif #else /*WITH_OPENMP */ do i = 1, stripe_count if (useGPU) then chunk = min(next_local_n - 1, a_off) do j = top_msg_length + 1, top_msg_length + next_local_n, chunk top = min(j + chunk, top_msg_length + next_local_n) this_chunk = top - j + 1 #ifdef DOUBLE_PRECISION_COMPLEX dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * & size_of_double_complex_datatype dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * & size_of_double_complex_datatype ! note: setting the parameter "cudaMemcpyDeviceToDevice" on every call is redundant; do this ONCE at startup ! tmp = cuda_d2d(1) successCUDA = cuda_memcpy( a_dev + dev_offset , a_dev+dev_offset_1, & stripe_width*this_chunk*size_of_double_complex_datatype, cudaMemcpyDeviceToDevice) #else dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * & size_of_single_complex_datatype dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * & size_of_single_complex_datatype ! note: setting the parameter "cudaMemcpyDeviceToDevice" on every call is redundant; do this ONCE at startup !
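! ---------------------------------------------------------------------- ! Note (comments only): the chunked device-to-device copy here shifts ! the remaining rows forward by a_off columns with !   chunk = min(next_local_n - 1, a_off) ! Since chunk <= a_off, the source range (columns j+a_off..) of one ! cudaMemcpy never overlaps its destination range (columns j..). ! e.g. (made-up sizes) a_off=16, next_local_n=100 moves 16 columns ! per memcpy. ! ----------------------------------------------------------------------
!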
tmp = cuda_d2d(1) successCUDA = cuda_memcpy( a_dev + dev_offset , a_dev+dev_offset_1, & stripe_width*this_chunk*size_of_single_complex_datatype, cudaMemcpyDeviceToDevice) #endif if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy" stop endif enddo else ! useGPU do j = top_msg_length+1, top_msg_length+next_local_n A(:,j,i) = A(:,j+a_off,i) enddo endif enddo #endif /*WITH_OPENMP */ a_off = 0 endif enddo #ifdef WITH_GPU_VERSION ! t2_outer_do_time =MPI_Wtime() ! t0_outer_do_time = t0_outer_do_time + ( t2_outer_do_time - t1_outer_do_time) ! ! istat = cuda_ProfilerStop() #endif ! Just for safety: #ifdef WITH_MPI if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol #endif if (my_prow == 0) then #ifdef WITH_MPI #ifdef WITH_OPENMP allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error allocating mpi_statuses "//errorMessage stop endif call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr) deallocate(mpi_statuses, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating mpi_statuses "//errorMessage stop endif #else call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) #endif #endif /* WITH_MPI */ endif #ifdef WITH_MPI if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol #endif if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6 ! deallocate all working space if (.not.(useGPU)) then nullify(a) call free(a_ptr) ! deallocate(a, stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"trans_ev_tridi_to_band_complex: error deallocating a "//errorMessage ! stop ! endif endif deallocate(row, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating row "//errorMessage stop endif deallocate(limits, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating limits "//errorMessage stop endif deallocate(result_send_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating result_send_request "//errorMessage stop endif deallocate(result_recv_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating result_recv_request "//errorMessage stop endif deallocate(top_border_send_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating top_border_send_buffer "//errorMessage stop endif deallocate(top_border_recv_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 
0) then print *,"trans_ev_tridi_to_band_complex: error deallocating top_border_recv_buffer "//errorMessage stop endif deallocate(bottom_border_send_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating bottom_border_send_buffer "//errorMessage stop endif deallocate(bottom_border_recv_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating bottom_border_recv_buffer "//errorMessage stop endif deallocate(result_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating result_buffer "//errorMessage stop endif deallocate(bcast_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating bcast_buffer "//errorMessage stop endif deallocate(top_send_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating top_send_request "//errorMessage stop endif deallocate(top_recv_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating top_recv_request "//errorMessage stop endif deallocate(bottom_send_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating bottom_send_request "//errorMessage stop endif deallocate(bottom_recv_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating bottom_recv_request "//errorMessage stop endif if (useGPU) then successCUDA = cuda_free(a_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaFree" stop endif successCUDA = cuda_free(hh_tau_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaFree" stop endif successCUDA = cuda_free(hh_dot_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaFree" stop endif successCUDA = cuda_free(row_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaFree" stop endif deallocate(row_group, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_complex: error deallocating row_group "//errorMessage stop endif successCUDA = cuda_free(row_group_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaFree" stop endif successCUDA = cuda_free(bcast_buffer_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaFree" stop endif endif ! useGPU #ifdef HAVE_DETAILED_TIMINGS #ifdef DOUBLE_PRECISION_COMPLEX call timer%stop("trans_ev_tridi_to_band_complex_double") #else call timer%stop("trans_ev_tridi_to_band_complex_single") #endif #endif return contains #ifdef DOUBLE_PRECISION_COMPLEX ! The host wrapper for extracting "tau" from the HH reflectors (see the !
kernel below) subroutine extract_hh_tau_complex_gpu_double(nbw, n, is_zero) #else subroutine extract_hh_tau_complex_gpu_single(nbw, n, is_zero) #endif use cuda_c_kernel use precision implicit none integer(kind=ik), value :: nbw, n logical, value :: is_zero integer(kind=ik) :: val_is_zero if (is_zero) then val_is_zero = 1 else val_is_zero = 0 endif #ifdef DOUBLE_PRECISION_COMPLEX call launch_extract_hh_tau_c_kernel_complex_double(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero) #else call launch_extract_hh_tau_c_kernel_complex_single(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero) #endif end subroutine #ifdef DOUBLE_PRECISION_COMPLEX subroutine compute_hh_dot_products_complex_gpu_double(nbw, n) #else subroutine compute_hh_dot_products_complex_gpu_single(nbw, n) #endif use cuda_c_kernel use precision implicit none integer(kind=ik), value :: nbw, n if (n .le. 1) return #ifdef DOUBLE_PRECISION_COMPLEX call launch_compute_hh_dotp_c_kernel_complex_double( bcast_buffer_dev, hh_dot_dev, nbw,n) #else call launch_compute_hh_dotp_c_kernel_complex_single( bcast_buffer_dev, hh_dot_dev, nbw,n) #endif end subroutine #ifdef DOUBLE_PRECISION_COMPLEX subroutine pack_row_group_complex_gpu_double(rows, n_offset, row_count) #else subroutine pack_row_group_complex_gpu_single(rows, n_offset, row_count) #endif use cuda_c_kernel use precision implicit none integer(kind=ik), intent(in) :: n_offset, row_count complex(kind=COMPLEX_DATATYPE) :: rows(:,:) integer(kind=ik) :: max_idx logical :: successCUDA max_idx = (stripe_count - 1) * stripe_width + last_stripe_width #ifdef DOUBLE_PRECISION_COMPLEX call launch_my_pack_c_kernel_complex_double(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, & l_nev, a_dev, row_group_dev) successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_double_complex_datatype, & cudaMemcpyDeviceToHost) #else call launch_my_pack_c_kernel_complex_single(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, & l_nev, a_dev, row_group_dev) successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_single_complex_datatype, & cudaMemcpyDeviceToHost) #endif if (.not.(successCUDA)) then print *,"pack_row_group_complex_gpu: error in cudaMemcpy" stop endif end subroutine #ifdef DOUBLE_PRECISION_COMPLEX subroutine unpack_row_group_complex_gpu_double(rows, n_offset, row_count) #else subroutine unpack_row_group_complex_gpu_single(rows, n_offset, row_count) #endif use cuda_c_kernel use precision implicit none integer(kind=ik), intent(in) :: n_offset, row_count complex(kind=COMPLEX_DATATYPE), intent(in) :: rows(:, :) integer(kind=ik) :: max_idx integer(kind=ik) :: i logical :: successCUDA max_idx = (stripe_count - 1) * stripe_width + last_stripe_width #ifdef DOUBLE_PRECISION_COMPLEX successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_double_complex_datatype , & cudaMemcpyHostToDevice) #else successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_single_complex_datatype , & cudaMemcpyHostToDevice) #endif if (.not.(successCUDA)) then print *,"unpack_row_group_complex_gpu: error in cudaMemcpy" stop endif #ifdef DOUBLE_PRECISION_COMPLEX call launch_my_unpack_c_kernel_complex_double( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, & row_group_dev,a_dev) #else call launch_my_unpack_c_kernel_complex_single( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, & row_group_dev,a_dev) #endif end subroutine #ifdef 
DOUBLE_PRECISION_COMPLEX subroutine unpack_and_prepare_row_group_complex_gpu_double(next_unpack_idx, force) #else subroutine unpack_and_prepare_row_group_complex_gpu_single(next_unpack_idx, force) #endif use precision implicit none integer(kind=ik), intent(in) :: next_unpack_idx logical, intent(in) :: force if (row_group_size == 0) then ! Nothing to flush, just prepare for the upcoming row row_group_size = 1 else if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then ! A flush and a reset must be performed #ifdef DOUBLE_PRECISION_COMPLEX call unpack_row_group_complex_gpu_double(row_group(:, :), unpack_idx - row_group_size, row_group_size) #else call unpack_row_group_complex_gpu_single(row_group(:, :), unpack_idx - row_group_size, row_group_size) #endif row_group_size = 1 else ! Just prepare for the upcoming row row_group_size = row_group_size + 1 endif endif ! Always update the index for the upcoming row unpack_idx = next_unpack_idx end subroutine #ifdef DOUBLE_PRECISION_COMPLEX subroutine compute_hh_trafo_complex_gpu_double(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2) #else subroutine compute_hh_trafo_complex_gpu_single(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2) #endif use iso_c_binding use cuda_c_kernel use precision implicit none integer(kind=ik), intent(in) :: off, ncols, istripe integer(kind=ik) :: nl real(kind=c_double) :: ttt ! MPI_WTIME always needs double integer(kind=ik) :: a_off integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2 if (ncols < 1) return ttt = mpi_wtime() nl = merge(stripe_width, last_stripe_width, istripe < stripe_count) #ifdef DOUBLE_PRECISION_COMPLEX dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * & size_of_double_complex_datatype dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_double_complex_datatype dev_offset_2 =( off-1 )*size_of_double_complex_datatype ! t1_compute_kernel =MPI_Wtime() call launch_compute_hh_trafo_c_kernel_complex_double(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, & hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols) #else dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * & size_of_single_complex_datatype dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_single_complex_datatype dev_offset_2 =( off-1 )*size_of_single_complex_datatype ! t1_compute_kernel =MPI_Wtime() call launch_compute_hh_trafo_c_kernel_complex_single(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, & hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols) #endif ! time0 = time0 + time1 ! t2_compute_kernel =MPI_Wtime() ! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel kernel_flops = kernel_flops + 4 * int(nl, 8) * int(ncols, 8) * int(nbw,8) kernel_time = kernel_time + mpi_wtime() - ttt n_times = n_times + 1 end subroutine end subroutine
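! ---------------------------------------------------------------------- ! Usage sketch (comments only, mirroring the call sites earlier in this ! routine; the _double names are shown, the _single variants are ! analogous, and idx/off/ncols/istripe/n_off/cnt stand for the loop ! indices used at the actual call sites): !   call unpack_and_prepare_row_group_complex_gpu_double(idx, .false.) !   call compute_hh_trafo_complex_gpu_double(off, ncols, istripe, a_off, & !                                            dev_offset, dev_offset_1, dev_offset_2) !   call pack_row_group_complex_gpu_double(row_group(:,:), n_off, cnt) ! ----------------------------------------------------------------------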