diff --git a/src/elpa2/elpa2_bandred_template.F90 b/src/elpa2/elpa2_bandred_template.F90 index 6d50d47e081fbb7e64fd13c1b8d807fe44d049a0..5f23a2b59fe536ffa3070ccf0b9efac35912614a 100644 --- a/src/elpa2/elpa2_bandred_template.F90 +++ b/src/elpa2/elpa2_bandred_template.F90 @@ -60,7 +60,7 @@ ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". #endif - subroutine bandred_& + subroutine bandred_& &MATH_DATATYPE& &_& &PRECISION & @@ -100,1090 +100,1092 @@ ! !------------------------------------------------------------------------------- - use cuda_functions - use iso_c_binding - use elpa1_compute + use cuda_functions + use iso_c_binding + use elpa1_compute #ifdef WITH_OPENMP - use omp_lib + use omp_lib #endif - use precision - use elpa_blas_interfaces + use precision + use elpa_blas_interfaces #ifdef WITH_MPI - use elpa_scalapack_interfaces + use elpa_scalapack_interfaces #endif - use elpa_abstract_impl + use elpa_abstract_impl - implicit none + implicit none #include "../general/precision_kinds.F90" - class(elpa_abstract_impl_t), intent(inout) :: obj - integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols #ifdef USE_ASSUMED_SIZE - MATH_DATATYPE(kind=rck) :: a_mat(lda,*) - MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,*) + MATH_DATATYPE(kind=rck) :: a_mat(lda,*) + MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,*) #else - MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols) - MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,numBlocks) + MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols) + MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,numBlocks) #endif #if REALCASE == 1 - real(kind=rk) :: eps + real(kind=rk) :: eps #endif - logical, intent(in) :: useGPU - integer(kind=c_int) :: skewsymmetric - logical :: isSkewsymmetric - character(20) :: gpuString - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols - integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI - integer(kind=ik) :: l_cols, l_rows + logical, intent(in) :: useGPU + integer(kind=c_int) :: skewsymmetric + logical :: isSkewsymmetric + character(20) :: gpuString + + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols + integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI + integer(kind=ik) :: l_cols, l_rows #if REALCASE == 1 - integer(kind=ik) :: vmrCols + integer(kind=ik) :: vmrCols #endif #ifdef WITH_OPENMP - integer(kind=ik) :: mynlc, lrs, transformChunkSize + integer(kind=ik) :: mynlc, lrs, transformChunkSize #endif - integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow - integer(kind=ik) :: istep, ncol, lch, lcx, nlc - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile + integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow + integer(kind=ik) :: istep, ncol, lch, lcx, nlc + integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - real(kind=rk) :: vnorm2 - MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau - MATH_DATATYPE(kind=rck) :: vav(nbw,nbw) + real(kind=rk) :: vnorm2 + MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau + MATH_DATATYPE(kind=rck) :: vav(nbw,nbw) - MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:) - MATH_DATATYPE(kind=rck), pointer :: vmrCUDA(:), umcCUDA(:) - MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:) - MATH_DATATYPE(kind=rck), allocatable :: vr(:) + MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:) + MATH_DATATYPE(kind=rck), pointer :: vmrCUDA(:), umcCUDA(:) + MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:) + MATH_DATATYPE(kind=rck), allocatable :: vr(:) #if REALCASE == 1 - ! needed for blocked QR decomposition - integer(kind=ik) :: PQRPARAM(11), work_size - real(kind=rk) :: dwork_size(1) - real(kind=rk), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:) + ! needed for blocked QR decomposition + integer(kind=ik) :: PQRPARAM(11), work_size + real(kind=rk) :: dwork_size(1) + real(kind=rk), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:) #endif - integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev - type(c_ptr) :: vmr_host, umc_host + integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev + type(c_ptr) :: vmr_host, umc_host #ifdef WITH_MPI - !integer(kind=ik), external :: numroc -> use elpa_scalapack + !integer(kind=ik), external :: numroc -> use elpa_scalapack #endif - integer(kind=ik) :: ierr - integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size - integer(kind=ik) :: l_rows2, vmr_size2, umc_size2 - integer(kind=c_intptr_t) :: lc_start, lc_end + integer(kind=ik) :: ierr + integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size + integer(kind=ik) :: l_rows2, vmr_size2, umc_size2 + integer(kind=c_intptr_t) :: lc_start, lc_end #if COMPLEXCASE == 1 - integer(kind=c_intptr_t) :: lce_1, lcs_1, lre_1 + integer(kind=c_intptr_t) :: lce_1, lcs_1, lre_1 #endif - integer(kind=ik) :: lr_end - integer(kind=ik) :: na_cols - integer(kind=BLAS_KIND) :: na_colsBLAS + integer(kind=ik) :: lr_end + integer(kind=ik) :: na_cols + integer(kind=BLAS_KIND) :: na_colsBLAS #if COMPLEXCASE == 1 - integer(kind=ik) :: na_rows - integer(kind=BLAS_KIND) :: na_rowsBLAS + integer(kind=ik) :: na_rows + integer(kind=BLAS_KIND) :: na_rowsBLAS #endif - logical, intent(in) :: wantDebug - logical, intent(out) :: success - logical :: successCUDA - integer(kind=ik) :: istat - character(200) :: errorMessage - integer(kind=ik) :: min_tile_size, error + logical, intent(in) :: wantDebug + logical, intent(out) :: success + logical :: successCUDA + integer(kind=ik) :: istat + character(200) :: errorMessage + integer(kind=ik) :: min_tile_size, error #if REALCASE == 1 - logical, intent(in) :: useQR + logical, intent(in) :: useQR #endif - integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, & - ii, pp - integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& - &PRECISION& - &_& - &MATH_DATATYPE - - logical :: useGPU_reduction_lower_block_to_tridiagonal - integer(kind=ik), intent(in) :: max_threads - logical :: do_memcpy - integer(kind=ik) :: i_blk,blk_off - - call obj%get("is_skewsymmetric",skewsymmetric,error) - if (error .ne. ELPA_OK) then - print *,"Problem getting option for skewsymmetric settings. Aborting..." - stop - endif - isSkewsymmetric = (skewsymmetric == 1) - - if(useGPU) then - gpuString = "_gpu" - else - gpuString = "" - endif - - call obj%timer%start("bandred_& - &MATH_DATATYPE& - &" // & - PRECISION_SUFFIX // & - gpuString ) - - useGPU_reduction_lower_block_to_tridiagonal = .false. - - if (useGPU) then - useGPU_reduction_lower_block_to_tridiagonal = .true. + integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, & + ii, pp + integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& + &PRECISION& + &_& + &MATH_DATATYPE + + logical :: useGPU_reduction_lower_block_to_tridiagonal + integer(kind=ik), intent(in) :: max_threads + logical :: do_memcpy + integer(kind=ik) :: i_blk,blk_off, blk_end + + call obj%get("is_skewsymmetric",skewsymmetric,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option for skewsymmetric settings. Aborting..." + stop + endif + isSkewsymmetric = (skewsymmetric == 1) + + if(useGPU) then + gpuString = "_gpu" + else + gpuString = "" + endif + + call obj%timer%start("bandred_& + &MATH_DATATYPE& + &" // & + PRECISION_SUFFIX // & + gpuString ) + + useGPU_reduction_lower_block_to_tridiagonal = .false. + + if (useGPU) then + useGPU_reduction_lower_block_to_tridiagonal = .true. #if REALCASE == 1 - if (useQR) then - !in this case switch off GPU usage for step "reduce current block to lower triangular form" - ! since this is done by QR decomposition - useGPU_reduction_lower_block_to_tridiagonal = .false. - endif + if (useQR) then + !in this case switch off GPU usage for step "reduce current block to lower triangular form" + ! since this is done by QR decomposition + useGPU_reduction_lower_block_to_tridiagonal = .false. + endif #endif + endif + + if (wantDebug) call obj%timer%start("mpi_communication") + + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI ,mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI ,mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI ,mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI ,mpierr) + + my_prow = int(my_prowMPI,kind=c_int) + np_rows = int(np_rowsMPI,kind=c_int) + my_pcol = int(my_pcolMPI,kind=c_int) + np_cols = int(np_colsMPI,kind=c_int) + + if (wantDebug) call obj%timer%stop("mpi_communication") + success = .true. + + + ! Semibandwith nbw must be a multiple of blocksize nblk + if (mod(nbw,nblk)/=0) then + if (my_prow==0 .and. my_pcol==0) then + if (wantDebug) then + write(error_unit,*) 'ELPA2_bandred_& + &MATH_DATATYPE& + &: ERROR: nbw=',nbw,', nblk=',nblk + write(error_unit,*) 'ELPA2_bandred_& + &MATH_DATATYPE& + &: ELPA2 works only for nbw==n*nblk' endif + success = .false. + return + endif + endif - if (wantDebug) call obj%timer%start("mpi_communication") - - call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI ,mpierr) - call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI ,mpierr) - call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI ,mpierr) - call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI ,mpierr) - - my_prow = int(my_prowMPI,kind=c_int) - np_rows = int(np_rowsMPI,kind=c_int) - my_pcol = int(my_pcolMPI,kind=c_int) - np_cols = int(np_colsMPI,kind=c_int) - - if (wantDebug) call obj%timer%stop("mpi_communication") - success = .true. - - - ! Semibandwith nbw must be a multiple of blocksize nblk - if (mod(nbw,nblk)/=0) then - if (my_prow==0 .and. my_pcol==0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_bandred_& - &MATH_DATATYPE& - &: ERROR: nbw=',nbw,', nblk=',nblk - write(error_unit,*) 'ELPA2_bandred_& - &MATH_DATATYPE& - &: ELPA2 works only for nbw==n*nblk' - endif - success = .false. - return - endif - endif - - ! na_rows in used nowhere; only na_cols - if (useGPU) then + ! na_rows in used nowhere; only na_cols + if (useGPU) then #ifdef WITH_MPI #if COMPLEXCASE == 1 - na_rowsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), & + na_rowsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), & int(my_prow,kind=BLAS_KIND), 0_BLAS_KIND, int(np_rows,kind=BLAS_KIND)) - na_rows = int(na_rowsBLAS,kind=c_int) + na_rows = int(na_rowsBLAS,kind=c_int) #endif - na_colsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), & + na_colsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), & int(my_pcol,kind=BLAS_KIND), 0_BLAS_KIND, int(np_cols,kind=BLAS_KIND)) - na_cols = int(na_colsBLAS,kind=c_int) + na_cols = int(na_colsBLAS,kind=c_int) #else #if COMPLEXCASE == 1 - na_rows = na + na_rows = na #endif - na_cols = na + na_cols = na #endif /* WITH_MPI */ - ! Here we convert the regular host array into a pinned host array - successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype) - check_alloc_cuda("bandred: a_dev", successCUDA) - - successCUDA = cuda_host_register(int(loc(vav),kind=c_intptr_t), & - nbw * nbw * size_of_datatype,& - cudaHostRegisterDefault) - check_host_register_cuda("bandred: vav", successCUDA) - - successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype) - check_alloc_cuda("bandred: vav_dev", successCUDA) - endif ! useGPU - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - - ! make tile_size a smallest possible multiple of previously defined tile size, such that it is - ! larger or equal to min_tile_size - ! min_tile_size has been originally hardcoded as 128 * max(np_rows, np_cols), so it is now the implicit value - ! it can, however, be set by the user - call obj%get("min_tile_size", min_tile_size ,error) - if (error .ne. ELPA_OK) then - print *,"Problem setting option for min_tile_size. Aborting..." - stop - endif - if(min_tile_size == 0) then - ! not set by the user, use the default value - min_tile_size = 128*max(np_rows, np_cols) - endif - tile_size = ((min_tile_size-1)/tile_size+1)*tile_size - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile + ! Here we convert the regular host array into a pinned host array + successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype) + check_alloc_cuda("bandred: a_dev", successCUDA) + + successCUDA = cuda_host_register(int(loc(vav),kind=c_intptr_t), & + nbw * nbw * size_of_datatype,& + cudaHostRegisterDefault) + check_host_register_cuda("bandred: vav", successCUDA) + + successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype) + check_alloc_cuda("bandred: vav_dev", successCUDA) + endif ! useGPU + + ! Matrix is split into tiles; work is done only for tiles on the diagonal or above + + tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size + + ! make tile_size a smallest possible multiple of previously defined tile size, such that it is + ! larger or equal to min_tile_size + ! min_tile_size has been originally hardcoded as 128 * max(np_rows, np_cols), so it is now the implicit value + ! it can, however, be set by the user + call obj%get("min_tile_size", min_tile_size ,error) + if (error .ne. ELPA_OK) then + print *,"Problem setting option for min_tile_size. Aborting..." + stop + endif + if(min_tile_size == 0) then + ! not set by the user, use the default value + min_tile_size = 128*max(np_rows, np_cols) + endif + tile_size = ((min_tile_size-1)/tile_size+1)*tile_size + + l_rows_tile = tile_size/np_rows ! local rows of a tile + l_cols_tile = tile_size/np_cols ! local cols of a tile #if REALCASE == 1 - if (useQR) then + if (useQR) then - if (which_qr_decomposition == 1) then - call qr_pqrparam_init(obj,pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s') - allocate(tauvector(na), stat=istat, errmsg=errorMessage) - check_allocate("bandred: tauvector", istat, errorMessage) + if (which_qr_decomposition == 1) then + call qr_pqrparam_init(obj,pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s') + allocate(tauvector(na), stat=istat, errmsg=errorMessage) + check_allocate("bandred: tauvector", istat, errorMessage) - allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage) - check_allocate("bandred: blockheuristic", istat, errorMessage) + allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage) + check_allocate("bandred: blockheuristic", istat, errorMessage) - l_rows = local_index(na, my_prow, np_rows, nblk, -1) - allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage) - check_allocate("bandred: vmrCPU", istat, errorMessage) + l_rows = local_index(na, my_prow, np_rows, nblk, -1) + allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage) + check_allocate("bandred: vmrCPU", istat, errorMessage) - vmrCols = na + vmrCols = na #ifdef USE_ASSUMED_SIZE_QR - call qr_pdgeqrf_2dcomm_& - &PRECISION& - &(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), & - nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & - mpi_comm_rows, mpi_comm_cols, blockheuristic) + call qr_pdgeqrf_2dcomm_& + &PRECISION& + &(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), & + nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & + mpi_comm_rows, mpi_comm_cols, blockheuristic) #else - call qr_pdgeqrf_2dcomm_& - &PRECISION& - &(obj, a_mat(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), & - vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, & - nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & - mpi_comm_rows, mpi_comm_cols, blockheuristic) + call qr_pdgeqrf_2dcomm_& + &PRECISION& + &(obj, a_mat(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), & + vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, & + nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & + mpi_comm_rows, mpi_comm_cols, blockheuristic) #endif - work_size = int(dwork_size(1)) - allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage) - check_allocate("bandred: work_blocked", istat, errorMessage) - work_blocked = 0.0_rk - deallocate(vmrCPU, stat=istat, errmsg=errorMessage) - check_deallocate("bandred: vmrCPU", istat, errorMessage) + work_size = int(dwork_size(1)) + allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage) + check_allocate("bandred: work_blocked", istat, errorMessage) + work_blocked = 0.0_rk + deallocate(vmrCPU, stat=istat, errmsg=errorMessage) + check_deallocate("bandred: vmrCPU", istat, errorMessage) - endif ! which_qr_decomposition + endif ! which_qr_decomposition - endif ! useQr + endif ! useQr #endif /* REALCASE */ - if (useGPU) then - - successCUDA = cuda_host_register(int(loc(a_mat),kind=c_intptr_t), & - lda*na_cols*size_of_datatype, cudaHostRegisterDefault) - check_host_register_cuda("bandred: a_mat", successCUDA) - - cur_l_rows = 0 - cur_l_cols = 0 - - successCUDA = cuda_memcpy(a_dev, int(loc(a_mat),kind=c_intptr_t), & - lda*na_cols*size_of_datatype, cudaMemcpyHostToDevice) - check_memcpy_cuda("bandred: a_dev", successCUDA) - - successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_datatype) - check_alloc_cuda("bandred: tmat_dev", successCUDA) - - istep = (na-1)/nbw - n_cols = min(na,(istep+1)*nbw)-istep*nbw - l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) - l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) - cur_l_rows = max(l_rows,1) - cur_l_cols = max(l_cols,1) - vmr_size = cur_l_rows*2*n_cols - umc_size = cur_l_cols*2*n_cols - - istep = (na-1)/nbw - 1 - n_cols = min(na,(istep+1)*nbw)-istep*nbw - l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) - l_rows2 = local_index(istep*nbw, my_prow, np_rows, nblk, -1) - cur_l_rows = max(l_rows2,1) - cur_l_cols = max(l_cols,1) - vmr_size2 = cur_l_rows*2*n_cols - umc_size2 = cur_l_cols*2*n_cols - - l_rows = max(l_rows,l_rows2) - vmr_size = max(vmr_size,vmr_size2) - umc_size = max(umc_size,umc_size2) - - allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_& - &MATH_DATATYPE& - &: error when allocating vr "//errorMessage - stop 1 - endif + blk_end = (na-1)/nbw + if (useGPU) then + + successCUDA = cuda_host_register(int(loc(a_mat),kind=c_intptr_t), & + lda*na_cols*size_of_datatype, cudaHostRegisterDefault) + check_host_register_cuda("bandred: a_mat", successCUDA) + + cur_l_rows = 0 + cur_l_cols = 0 + + successCUDA = cuda_memcpy(a_dev, int(loc(a_mat),kind=c_intptr_t), & + lda*na_cols*size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("bandred: a_dev", successCUDA) + + successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_datatype) + check_alloc_cuda("bandred: tmat_dev", successCUDA) + + istep = (na-1)/nbw + blk_end = (na-1)/nbw + n_cols = min(na,(istep+1)*nbw)-istep*nbw + l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) + l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) + cur_l_rows = max(l_rows,1) + cur_l_cols = max(l_cols,1) + vmr_size = cur_l_rows*2*n_cols + umc_size = cur_l_cols*2*n_cols + + istep = (na-1)/nbw - 1 + n_cols = min(na,(istep+1)*nbw)-istep*nbw + l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) + l_rows2 = local_index(istep*nbw, my_prow, np_rows, nblk, -1) + cur_l_rows = max(l_rows2,1) + cur_l_cols = max(l_cols,1) + vmr_size2 = cur_l_rows*2*n_cols + umc_size2 = cur_l_cols*2*n_cols + + l_rows = max(l_rows,l_rows2) + vmr_size = max(vmr_size,vmr_size2) + umc_size = max(umc_size,umc_size2) + + allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when allocating vr "//errorMessage + stop 1 + endif - successCUDA = cuda_malloc_host(vmr_host,vmr_size*size_of_datatype) - check_host_alloc_cuda("bandred: vmr_host", successCUDA) - call c_f_pointer(vmr_host, vmrCUDA, (/vmr_size/)) + successCUDA = cuda_malloc_host(vmr_host,vmr_size*size_of_datatype) + check_host_alloc_cuda("bandred: vmr_host", successCUDA) + call c_f_pointer(vmr_host, vmrCUDA, (/vmr_size/)) - successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_datatype) - check_alloc_cuda("bandred: vmr_dev", successCUDA) + successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_datatype) + check_alloc_cuda("bandred: vmr_dev", successCUDA) - successCUDA = cuda_malloc_host(umc_host,umc_size*size_of_datatype) - check_host_alloc_cuda("bandred: umc_host", successCUDA) - call c_f_pointer(umc_host, umcCUDA, (/umc_size/)) + successCUDA = cuda_malloc_host(umc_host,umc_size*size_of_datatype) + check_host_alloc_cuda("bandred: umc_host", successCUDA) + call c_f_pointer(umc_host, umcCUDA, (/umc_size/)) - successCUDA = cuda_malloc(umc_dev, umc_size*size_of_datatype) - check_alloc_cuda("bandred: umc_dev", successCUDA) + successCUDA = cuda_malloc(umc_dev, umc_size*size_of_datatype) + check_alloc_cuda("bandred: umc_dev", successCUDA) - endif ! useGPU + endif ! useGPU - do istep = (na-1)/nbw, 1, -1 + do istep = blk_end, 1, -1 - n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step + n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step - ! Number of local columns/rows of remaining matrix - l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) - l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) + ! Number of local columns/rows of remaining matrix + l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) + l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) - ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces + ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces - if (useGPU) then - cur_l_rows = max(l_rows, 1) - cur_l_cols = max(l_cols, 1) - vmr_size = cur_l_rows * 2 * n_cols - umc_size = cur_l_cols * 2 * n_cols + if (useGPU) then + cur_l_rows = max(l_rows, 1) + cur_l_cols = max(l_cols, 1) + vmr_size = cur_l_rows * 2 * n_cols + umc_size = cur_l_cols * 2 * n_cols - else ! GPU not used + else ! GPU not used - ! unify the the name vmr and vmrCPU, as well as vmrGPU - ! the same for umcCPU and umcGPU - ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces + ! unify the the name vmr and vmrCPU, as well as vmrGPU + ! the same for umcCPU and umcGPU + ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces - allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage) - check_allocate("bandred: vmrCPU", istat, errorMessage) + allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage) + check_allocate("bandred: vmrCPU", istat, errorMessage) - allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage) - check_allocate("bandred: umcCPU", istat, errorMessage) + allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage) + check_allocate("bandred: umcCPU", istat, errorMessage) - allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage) - check_allocate("bandred: vr", istat, errorMessage) + allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage) + check_allocate("bandred: vr", istat, errorMessage) - endif ! use GPU + endif ! use GPU - if (useGPU) then - vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck - umcCUDA(1 : umc_size) = 0.0_rck - else - vmrCPU(1:l_rows,1:n_cols) = 0.0_rck - endif ! useGPU + if (useGPU) then + vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck + umcCUDA(1 : umc_size) = 0.0_rck + else + vmrCPU(1:l_rows,1:n_cols) = 0.0_rck + endif ! useGPU - vr(:) = 0.0_rck - tmat(:,:,istep) = 0.0_rck - if (useGPU) then - lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1) - lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1) - lr_end = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1) + vr(:) = 0.0_rck + tmat(:,:,istep) = 0.0_rck + if (useGPU) then + lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1) + lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1) + lr_end = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1) - if (lc_start .le. 0) lc_start = 1 + if (lc_start .le. 0) lc_start = 1 - do_memcpy = .false. + do_memcpy = .false. - ! Note: mod(nbw,nblk) == 0 - do i_blk = 1, nbw/nblk - blk_off = (i_blk-1) * nblk - cur_pcol = pcol(istep*nbw+1+blk_off, nblk, np_cols) + ! Note: mod(nbw,nblk) == 0 + do i_blk = 1, nbw/nblk + blk_off = (i_blk-1) * nblk + cur_pcol = pcol(istep*nbw+1+blk_off, nblk, np_cols) - if (my_pcol == cur_pcol) then - do_memcpy = .true. - endif - enddo + if (my_pcol == cur_pcol) then + do_memcpy = .true. + endif + enddo - if (do_memcpy) then - successCUDA = cuda_memcpy2d(int(loc(a_mat(1, lc_start)),kind=c_intptr_t), & - int((lda*size_of_datatype),kind=c_intptr_t), & - (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )), & - int(lda*size_of_datatype,kind=c_intptr_t), & - int(lr_end*size_of_datatype,kind=c_intptr_t), & - int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int)) + if (do_memcpy) then + successCUDA = cuda_memcpy2d(int(loc(a_mat(1, lc_start)),kind=c_intptr_t), & + int((lda*size_of_datatype),kind=c_intptr_t), & + (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )), & + int(lda*size_of_datatype,kind=c_intptr_t), & + int(lr_end*size_of_datatype,kind=c_intptr_t), & + int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int)) - check_memcpy_cuda("bandred: a_dev -> a_mat", successCUDA) - endif - endif ! useGPU + check_memcpy_cuda("bandred: a_dev -> a_mat", successCUDA) + endif + endif ! useGPU - ! Reduce current block to lower triangular form + ! Reduce current block to lower triangular form #if REALCASE == 1 - if (useQR) then - if (which_qr_decomposition == 1) then - vmrCols = 2*n_cols + if (useQR) then + if (which_qr_decomposition == 1) then + vmrCols = 2*n_cols #ifdef USE_ASSUMED_SIZE_QR - call qr_pdgeqrf_2dcomm_& - &PRECISION& - &(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), & - na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, & - work_size, na, n_cols, nblk, nblk, & - istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& - 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& - blockheuristic) + call qr_pdgeqrf_2dcomm_& + &PRECISION& + &(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), & + na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, & + work_size, na, n_cols, nblk, nblk, & + istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& + 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& + blockheuristic) #else - call qr_pdgeqrf_2dcomm_& - &PRECISION& - &(obj, a_mat(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) , & - max(l_rows,1), vmrCols, tauvector(1:na), na, & - tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, & - work_size, na, n_cols, nblk, nblk, & - istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& - 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& - blockheuristic) + call qr_pdgeqrf_2dcomm_& + &PRECISION& + &(obj, a_mat(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) , & + max(l_rows,1), vmrCols, tauvector(1:na), na, & + tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, & + work_size, na, n_cols, nblk, nblk, & + istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& + 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& + blockheuristic) #endif - endif + endif - else !useQR + else !useQR #endif /* REALCASE == 1 */ - do lc = n_cols, 1, -1 + do lc = n_cols, 1, -1 - ncol = istep*nbw + lc ! absolute column number of householder Vector - nrow = ncol - nbw ! Absolute number of pivot row + ncol = istep*nbw + lc ! absolute column number of householder Vector + nrow = ncol - nbw ! Absolute number of pivot row - lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length - lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number + lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length + lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number - tau = 0 + tau = 0 - if (nrow == 1) exit ! Nothing to do + if (nrow == 1) exit ! Nothing to do - cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block + cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block - if (my_pcol==cur_pcol) then + if (my_pcol==cur_pcol) then - ! Get Vector to be transformed; distribute last element and norm of - ! remaining elements to all procs in current column + ! Get Vector to be transformed; distribute last element and norm of + ! remaining elements to all procs in current column - vr(1:lr) = a_mat(1:lr,lch) ! Vector to be transformed + vr(1:lr) = a_mat(1:lr,lch) ! Vector to be transformed - if (my_prow==prow(nrow, nblk, np_rows)) then - aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) - aux1(2) = vr(lr) - else - aux1(1) = dot_product(vr(1:lr),vr(1:lr)) - aux1(2) = 0.0_rck - endif + if (my_prow==prow(nrow, nblk, np_rows)) then + aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) + aux1(2) = vr(lr) + else + aux1(1) = dot_product(vr(1:lr),vr(1:lr)) + aux1(2) = 0.0_rck + endif #ifdef WITH_MPI - if (wantDebug) call obj%timer%start("mpi_communication") - call mpi_allreduce(aux1, aux2, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, & - MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) - if (wantDebug) call obj%timer%stop("mpi_communication") + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_allreduce(aux1, aux2, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - aux2 = aux1 ! this should be optimized + aux2 = aux1 ! this should be optimized #endif #if REALCASE == 1 - vnorm2 = aux2(1) + vnorm2 = aux2(1) #endif #if COMPLEXCASE == 1 - vnorm2 = real(aux2(1),kind=rk) + vnorm2 = real(aux2(1),kind=rk) #endif - vrl = aux2(2) + vrl = aux2(2) - ! Householder transformation - call hh_transform_& + ! Householder transformation + call hh_transform_& &MATH_DATATYPE& &_& &PRECISION & (obj, vrl, vnorm2, xf, tau, wantDebug) - ! Scale vr and store Householder Vector for back transformation - - vr(1:lr) = vr(1:lr) * xf - if (my_prow==prow(nrow, nblk, np_rows)) then - a_mat(1:lr-1,lch) = vr(1:lr-1) - a_mat(lr,lch) = vrl - vr(lr) = 1.0_rck - else - a_mat(1:lr,lch) = vr(1:lr) - endif + ! Scale vr and store Householder Vector for back transformation + + vr(1:lr) = vr(1:lr) * xf + if (my_prow==prow(nrow, nblk, np_rows)) then + a_mat(1:lr-1,lch) = vr(1:lr-1) + a_mat(lr,lch) = vrl + vr(lr) = 1.0_rck + else + a_mat(1:lr,lch) = vr(1:lr) + endif - endif + endif - ! Broadcast Householder Vector and tau along columns + ! Broadcast Householder Vector and tau along columns - vr(lr+1) = tau + vr(lr+1) = tau #ifdef WITH_MPI - if (wantDebug) call obj%timer%start("mpi_communication") - call MPI_Bcast(vr, int(lr+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & - int(cur_pcol,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr) - if (wantDebug) call obj%timer%stop("mpi_communication") + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Bcast(vr, int(lr+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + int(cur_pcol,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") #endif /* WITH_MPI */ - if (useGPU_reduction_lower_block_to_tridiagonal) then - vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr) - else - vmrCPU(1:lr,lc) = vr(1:lr) - endif - tau = vr(lr+1) + if (useGPU_reduction_lower_block_to_tridiagonal) then + vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr) + else + vmrCPU(1:lr,lc) = vr(1:lr) + endif + tau = vr(lr+1) #if REALCASE == 1 - tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat + tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat #endif #if COMPLEXCASE == 1 - tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat + tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat #endif - ! Transform remaining columns in current block with Householder Vector - ! Local dot product + ! Transform remaining columns in current block with Householder Vector + ! Local dot product - aux1 = 0.0_rck + aux1 = 0.0_rck #ifdef WITH_OPENMP #if 0 ! original complex implementation without openmp. check performance - nlc = 0 ! number of local columns - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx)) - endif - enddo - - ! Get global dot products + nlc = 0 ! number of local columns + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0) then + nlc = nlc+1 + aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx)) + endif + enddo + + ! Get global dot products #ifdef WITH_MPI - if (wantDebug) call obj%timer%start("mpi_communication") - if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, & + if (wantDebug) call obj%timer%start("mpi_communication") + if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, & int(mpi_comm_rows,kind=MPI_KIND), mpierr) - ! Transform + ! Transform - nlc = 0 - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) + nlc = 0 + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0) then + nlc = nlc+1 + a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) - endif - enddo + endif + enddo - if (wantDebug) call obj%timer%stop("mpi_communication") + if (wantDebug) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - ! Transform + ! Transform - nlc = 0 - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr) - endif - enddo + nlc = 0 + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0) then + nlc = nlc+1 + a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr) + endif + enddo #endif /* WITH_MPI */ ! -! ! Transform +! ! Transform ! -! nlc = 0 -! do j=1,lc-1 -! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) -! if (lcx>0) then -! nlc = nlc+1 -! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) - -! endif -! enddo +! nlc = 0 +! do j=1,lc-1 +! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) +! if (lcx>0) then +! nlc = nlc+1 +! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) + +! endif +! enddo #endif /* if 0 */ - !Open up one omp region to avoid paying openmp overhead. - !This does not help performance due to the addition of two openmp barriers around the MPI call, - !But in the future this may be beneficial if these barriers are replaced with a faster implementation - - !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1) - mynlc = 0 ! number of local columns - - !This loop does not have independent iterations, - !'mynlc' is incremented each iteration, and it is difficult to remove this dependency - !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration - !That is, a thread only executes the work associated with an iteration if its thread id is congruent to - !the iteration number modulo the number of threads - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0 ) then - mynlc = mynlc+1 - if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then - if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx)) - endif - endif - enddo - - ! Get global dot products - - !$omp barrier - !$omp single + !Open up one omp region to avoid paying openmp overhead. + !This does not help performance due to the addition of two openmp barriers around the MPI call, + !But in the future this may be beneficial if these barriers are replaced with a faster implementation + + !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1) + mynlc = 0 ! number of local columns + + !This loop does not have independent iterations, + !'mynlc' is incremented each iteration, and it is difficult to remove this dependency + !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration + !That is, a thread only executes the work associated with an iteration if its thread id is congruent to + !the iteration number modulo the number of threads + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0 ) then + mynlc = mynlc+1 + if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then + if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx)) + endif + endif + enddo + + ! Get global dot products + + !$omp barrier + !$omp single #ifdef WITH_MPI - if (wantDebug) call obj%timer%start("mpi_communication") - if (mynlc>0) call mpi_allreduce(aux1, aux2, int(mynlc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & - MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) - if (wantDebug) call obj%timer%stop("mpi_communication") + if (wantDebug) call obj%timer%start("mpi_communication") + if (mynlc>0) call mpi_allreduce(aux1, aux2, int(mynlc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - if (mynlc>0) aux2 = aux1 + if (mynlc>0) aux2 = aux1 #endif /* WITH_MPI */ - !$omp end single - !$omp barrier - - ! Transform - transformChunkSize=32 - mynlc = 0 - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - mynlc = mynlc+1 - !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32 - !However, for some reason this is slower than doing it manually, so it is parallelized as below. - do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize - do pp = 1,transformChunkSize - if (pp + ii > lr) exit + !$omp end single + !$omp barrier + + ! Transform + transformChunkSize=32 + mynlc = 0 + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0) then + mynlc = mynlc+1 + !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32 + !However, for some reason this is slower than doing it manually, so it is parallelized as below. + do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize + do pp = 1,transformChunkSize + if (pp + ii > lr) exit #if REALCASE == 1 - a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp) + a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp) #endif #if COMPLEXCASE == 1 - a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp) + a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp) #endif - enddo - enddo - endif - enddo - !$omp end parallel + enddo + enddo + endif + enddo + !$omp end parallel #else /* WITH_OPENMP */ - nlc = 0 ! number of local columns - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx)) - endif - enddo + nlc = 0 ! number of local columns + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0) then + nlc = nlc+1 + if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx)) + endif + enddo - ! Get global dot products + ! Get global dot products #ifdef WITH_MPI - if (wantDebug) call obj%timer%start("mpi_communication") - if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & - MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) - if (wantDebug) call obj%timer%stop("mpi_communication") + if (wantDebug) call obj%timer%start("mpi_communication") + if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - if (nlc>0) aux2=aux1 + if (nlc>0) aux2=aux1 #endif /* WITH_MPI */ - ! Transform + ! Transform - nlc = 0 - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 + nlc = 0 + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0) then + nlc = nlc+1 #if REALCASE == 1 - a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr) + a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr) #endif #if COMPLEXCASE == 1 - a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) + a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) #endif - endif - enddo + endif + enddo #endif /* WITH_OPENMP */ - enddo ! lc - - if (useGPU_reduction_lower_block_to_tridiagonal) then - ! store column tiles back to GPU - if (do_memcpy) then - successCUDA = cuda_memcpy2d((a_dev+ & - int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), & - int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), & - int(lda*size_of_datatype,kind=c_intptr_t), & - int(lr_end*size_of_datatype,kind=c_intptr_t), & - int((lc_end - lc_start+1),kind=c_intptr_t), & - int(cudaMemcpyHostToDevice,kind=c_int)) - check_memcpy_cuda("bandred: a_mat -> a_dev", successCUDA) - endif - endif + enddo ! lc + + if (useGPU_reduction_lower_block_to_tridiagonal) then + ! store column tiles back to GPU + if (do_memcpy) then + successCUDA = cuda_memcpy2d((a_dev+ & + int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), & + int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), & + int(lda*size_of_datatype,kind=c_intptr_t), & + int(lr_end*size_of_datatype,kind=c_intptr_t), & + int((lc_end - lc_start+1),kind=c_intptr_t), & + int(cudaMemcpyHostToDevice,kind=c_int)) + check_memcpy_cuda("bandred: a_mat -> a_dev", successCUDA) + endif + endif - ! Calculate scalar products of stored Householder vectors. - ! This can be done in different ways, we use dsyrk + ! Calculate scalar products of stored Householder vectors. + ! This can be done in different ways, we use dsyrk - vav = 0 - call obj%timer%start("blas") - if (useGPU_reduction_lower_block_to_tridiagonal) then - if (l_rows>0) & + vav = 0 + call obj%timer%start("blas") + if (useGPU_reduction_lower_block_to_tridiagonal) then + if (l_rows>0) & #if REALCASE == 1 - call PRECISION_SYRK('U', 'T', & + call PRECISION_SYRK('U', 'T', & #endif #if COMPLEXCASE == 1 - call PRECISION_HERK('U', 'C', & + call PRECISION_HERK('U', 'C', & #endif int(n_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, & vmrCUDA, int(cur_l_rows,kind=BLAS_KIND), & ZERO, vav, int(ubound(vav,dim=1),kind=BLAS_KIND)) - else ! useGPU_reduction_to_tridiagonal - if (l_rows>0) & + else ! useGPU_reduction_to_tridiagonal + if (l_rows>0) & #if REALCASE == 1 - call PRECISION_SYRK('U', 'T', & + call PRECISION_SYRK('U', 'T', & #endif #if COMPLEXCASE == 1 - call PRECISION_HERK('U', 'C', & + call PRECISION_HERK('U', 'C', & #endif - int(n_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, vmrCPU, & - int(ubound(vmrCPU,dim=1),kind=BLAS_KIND), ZERO, vav, int(ubound(vav,dim=1),kind=BLAS_KIND)) - endif - call obj%timer%stop("blas") + int(n_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, vmrCPU, & + int(ubound(vmrCPU,dim=1),kind=BLAS_KIND), ZERO, vav, int(ubound(vav,dim=1),kind=BLAS_KIND)) + endif + call obj%timer%stop("blas") #if REALCASE == 1 - call symm_matrix_allreduce_& + call symm_matrix_allreduce_& #endif #if COMPLEXCASE == 1 - call herm_matrix_allreduce_& + call herm_matrix_allreduce_& #endif &PRECISION & (obj, n_cols,vav, nbw, nbw,mpi_comm_rows) ! Calculate triangular matrix T for block Householder Transformation - call obj%timer%start("blas") - do lc=n_cols,1,-1 - tau = tmat(lc,lc,istep) - if (lc a_dev", successCUDA) - endif + if (useGPU .and. useQR) then + ! copy the data for furhter usage + ! qr worked on *CPU arrarys + !vmrCUDA(1:cur_l_rows * n_cols) = vmrCPU(1:cur_l_rows,1:n_cols) + if (do_memcpy) then + successCUDA = cuda_memcpy2d((a_dev+ & + int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), & + int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), & + int(lda*size_of_datatype,kind=c_intptr_t), & + int(lr_end*size_of_datatype,kind=c_intptr_t), & + int((lc_end - lc_start+1),kind=c_intptr_t), & + int(cudaMemcpyHostToDevice,kind=c_int)) + check_memcpy_cuda("bandred: a_mat -> a_dev", successCUDA) + endif - endif + endif #endif - ! Transpose vmr -> vmc (stored in umc, second half) - if (useGPU) then - call elpa_transpose_vectors_& - &MATH_DATATYPE& - &_& - &PRECISION & - (obj, vmrCUDA(:), cur_l_rows, mpi_comm_rows, & - umcCUDA(cur_l_cols * n_cols + 1:), cur_l_cols, & - mpi_comm_cols, 1, istep*nbw, n_cols, nblk, max_threads) - else ! useGPU - call elpa_transpose_vectors_& - &MATH_DATATYPE& - &_& - &PRECISION & - (obj, vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, & - umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, & - 1, istep*nbw, n_cols, nblk, max_threads) - endif + ! Transpose vmr -> vmc (stored in umc, second half) + if (useGPU) then + call elpa_transpose_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, vmrCUDA(:), cur_l_rows, mpi_comm_rows, & + umcCUDA(cur_l_cols * n_cols + 1:), cur_l_cols, & + mpi_comm_cols, 1, istep*nbw, n_cols, nblk, max_threads) + else ! useGPU + call elpa_transpose_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, & + umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, & + 1, istep*nbw, n_cols, nblk, max_threads) + endif - ! Calculate umc = A**T * vmr - ! Note that the distributed A has to be transposed - ! Opposed to direct tridiagonalization there is no need to use the cache locality - ! of the tiles, so we can use strips of the matrix + ! Calculate umc = A**T * vmr + ! Note that the distributed A has to be transposed + ! Opposed to direct tridiagonalization there is no need to use the cache locality + ! of the tiles, so we can use strips of the matrix #if 0 - ! original complex implemetation check for performance - umcCPU(1:l_cols,1:n_cols) = 0.0_rck - vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck + ! original complex implemetation check for performance + umcCPU(1:l_cols,1:n_cols) = 0.0_rck + vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck - if (l_cols>0 .and. l_rows>0) then - do i=0,(istep*nbw-1)/tile_size + if (l_cols>0 .and. l_rows>0) then + do i=0,(istep*nbw-1)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce0 .and. l_rows>0) + endif ! (l_cols>0 .and. l_rows>0) #endif /* if 0 */ - !Code for Algorithm 4 + !Code for Algorithm 4 - ! n_way is actually a branch for the number of OpenMP threads - n_way = 1 + ! n_way is actually a branch for the number of OpenMP threads + n_way = 1 #ifdef WITH_OPENMP #if REALCASE == 1 - n_way = max_threads + n_way = max_threads - !$omp parallel private( i,lcs,lce,lrs,lre) + !$omp parallel private( i,lcs,lce,lrs,lre) #endif - if (n_way > 1) then + if (n_way > 1) then #if REALCASE == 1 - !$omp do + !$omp do #endif - do i=1,min(l_cols_tile, l_cols) - umcCPU(i,1:n_cols) = 0.0_rck - enddo + do i=1,min(l_cols_tile, l_cols) + umcCPU(i,1:n_cols) = 0.0_rck + enddo #if REALCASE == 1 - !$omp do + !$omp do #endif - do i=1,l_rows - vmrCPU(i,n_cols+1:2*n_cols) = 0.0_rck - enddo - - if (l_cols>0 .and. l_rows>0) then - - !SYMM variant 4 - !Partitioned Matrix Expression: - ! Ct = Atl Bt + Atr Bb - ! Cb = Atr' Bt + Abl Bb - ! - !Loop invariant: - ! Ct = Atl Bt + Atr Bb - ! - !Update: - ! C1 = A10'B0 + A11B1 + A21 B2 - ! - !This algorithm chosen because in this algoirhtm, the loop around the dgemm calls - !is easily parallelized, and regardless of choise of algorithm, - !the startup cost for parallelizing the dgemms inside the loop is too great + do i=1,l_rows + vmrCPU(i,n_cols+1:2*n_cols) = 0.0_rck + enddo + + if (l_cols>0 .and. l_rows>0) then + + !SYMM variant 4 + !Partitioned Matrix Expression: + ! Ct = Atl Bt + Atr Bb + ! Cb = Atr' Bt + Abl Bb + ! + !Loop invariant: + ! Ct = Atl Bt + Atr Bb + ! + !Update: + ! C1 = A10'B0 + A11B1 + A21 B2 + ! + !This algorithm chosen because in this algoirhtm, the loop around the dgemm calls + !is easily parallelized, and regardless of choise of algorithm, + !the startup cost for parallelizing the dgemms inside the loop is too great #if REALCASE == 1 - !$omp do schedule(static,1) + !$omp do schedule(static,1) #endif - do i=0,(istep*nbw-1)/tile_size - lcs = i*l_cols_tile+1 ! local column start - lce = min(l_cols, (i+1)*l_cols_tile) ! local column end - - lrs = i*l_rows_tile+1 ! local row start - lre = min(l_rows, (i+1)*l_rows_tile) ! local row end - - !C1 += [A11 A12] [B1 - ! B2] - if ( lre > lrs .and. l_cols > lcs ) then - call obj%timer%start("blas") - if (isSkewsymmetric) then - call PRECISION_GEMM('N', 'N', int(lre-lrs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), & - int(l_cols-lcs+1,kind=BLAS_KIND), & - -ONE, a_mat(lrs,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND), & - umcCPU(lcs,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), & - ZERO, vmrCPU(lrs,n_cols+1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND) ) - else - call PRECISION_GEMM('N', 'N', int(lre-lrs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), & - int(l_cols-lcs+1,kind=BLAS_KIND), & - ONE, a_mat(lrs,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND), & - umcCPU(lcs,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), & - ZERO, vmrCPU(lrs,n_cols+1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND) ) - - endif - call obj%timer%stop("blas") - endif - - ! C1 += A10' B0 - if ( lce > lcs .and. i > 0 ) then - call obj%timer%start("blas") - call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & - int(lce-lcs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(lrs-1,kind=BLAS_KIND), & - ONE, a_mat(1,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND), & - vmrCPU(1,1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND), & - ZERO, umcCPU(lcs,1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND) ) - call obj%timer%stop("blas") - endif - enddo - endif ! l_cols>0 .and. l_rows>0 - - else ! n_way > 1 + do i=0,(istep*nbw-1)/tile_size + lcs = i*l_cols_tile+1 ! local column start + lce = min(l_cols, (i+1)*l_cols_tile) ! local column end + + lrs = i*l_rows_tile+1 ! local row start + lre = min(l_rows, (i+1)*l_rows_tile) ! local row end + + !C1 += [A11 A12] [B1 + ! B2] + if ( lre > lrs .and. l_cols > lcs ) then + call obj%timer%start("blas") + if (isSkewsymmetric) then + call PRECISION_GEMM('N', 'N', int(lre-lrs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), & + int(l_cols-lcs+1,kind=BLAS_KIND), & + -ONE, a_mat(lrs,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND), & + umcCPU(lcs,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), & + ZERO, vmrCPU(lrs,n_cols+1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND) ) + else + call PRECISION_GEMM('N', 'N', int(lre-lrs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), & + int(l_cols-lcs+1,kind=BLAS_KIND), & + ONE, a_mat(lrs,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND), & + umcCPU(lcs,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), & + ZERO, vmrCPU(lrs,n_cols+1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND) ) + + endif + call obj%timer%stop("blas") + endif + + ! C1 += A10' B0 + if ( lce > lcs .and. i > 0 ) then + call obj%timer%start("blas") + call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & + int(lce-lcs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(lrs-1,kind=BLAS_KIND), & + ONE, a_mat(1,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND), & + vmrCPU(1,1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND), & + ZERO, umcCPU(lcs,1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND) ) + call obj%timer%stop("blas") + endif + enddo + endif ! l_cols>0 .and. l_rows>0 + + else ! n_way > 1 #endif /* WITH_OPENMP */ - if (.not. useGPU) then - umcCPU(1:l_cols,1:n_cols) = 0.0_rck - vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck + if (.not. useGPU) then + umcCPU(1:l_cols,1:n_cols) = 0.0_rck + vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck + endif ! useGPU + + if (l_cols>0 .and. l_rows>0) then + + if (useGPU) then + successCUDA = cuda_memset(vmr_dev+cur_l_rows*n_cols*size_of_datatype, & + 0, cur_l_rows*n_cols*size_of_datatype) + check_memset_cuda("bandred: vmr_dev", successCUDA) + + successCUDA = cuda_memcpy(vmr_dev, int(loc(vmrCUDA(1)),kind=c_intptr_t), & + cur_l_rows*n_cols*size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("bandred: vmrCUDA -> vmr_dev", successCUDA) + + successCUDA = cuda_memset(umc_dev, 0, l_cols*n_cols*size_of_datatype) + check_memset_cuda("bandred: umc_dev", successCUDA) + + successCUDA = cuda_memcpy(umc_dev+l_cols*n_cols*size_of_datatype, & + int(loc(umcCUDA(1+l_cols*n_cols)),kind=c_intptr_t), & + (umc_size-l_cols*n_cols)*size_of_datatype, & + cudaMemcpyHostToDevice) + check_memcpy_cuda("bandred: umcCUDA -> umc_dev", successCUDA) endif ! useGPU - if (l_cols>0 .and. l_rows>0) then + do i=0,(istep*nbw-1)/tile_size - if (useGPU) then - successCUDA = cuda_memset(vmr_dev+cur_l_rows*n_cols*size_of_datatype, & - 0, cur_l_rows*n_cols*size_of_datatype) - check_memset_cuda("bandred: vmr_dev", successCUDA) - - successCUDA = cuda_memcpy(vmr_dev, int(loc(vmrCUDA(1)),kind=c_intptr_t), & - cur_l_rows*n_cols*size_of_datatype, cudaMemcpyHostToDevice) - check_memcpy_cuda("bandred: vmrCUDA -> vmr_dev", successCUDA) - - successCUDA = cuda_memset(umc_dev, 0, l_cols*n_cols*size_of_datatype) - check_memset_cuda("bandred: umc_dev", successCUDA) - - successCUDA = cuda_memcpy(umc_dev+l_cols*n_cols*size_of_datatype, & - int(loc(umcCUDA(1+l_cols*n_cols)),kind=c_intptr_t), & - (umc_size-l_cols*n_cols)*size_of_datatype, & - cudaMemcpyHostToDevice) - check_memcpy_cuda("bandred: umcCUDA -> umc_dev", successCUDA) - endif ! useGPU + lcs = i*l_cols_tile+1 + lce = min(l_cols,(i+1)*l_cols_tile) + if (lce 1) then - successCUDA = cuda_memcpy(int(loc(vmrCUDA(1+cur_l_rows*n_cols)),kind=c_intptr_t), & - vmr_dev+cur_l_rows*n_cols*size_of_datatype, & - (vmr_size-cur_l_rows*n_cols)*size_of_datatype, cudaMemcpyDeviceToHost) - check_memcpy_cuda("bandred: vmr_dev -> vmrCUDA", successCUDA) + cur_l_cols, ONE, (vmr_dev+(cur_l_rows * n_cols)* & + size_of_datatype), & + cur_l_rows) endif - - successCUDA = cuda_memcpy(int(loc(umcCUDA(1)),kind=c_intptr_t), & - umc_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost) - check_memcpy_cuda("bandred: umc_dev -> umcCUDA", successCUDA) + call obj%timer%stop("cublas") + else ! useGPU + + call obj%timer%start("blas") + call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & + int(lce-lcs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(lre,kind=BLAS_KIND), & + ONE, a_mat(1,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND), & + vmrCPU, int(ubound(vmrCPU,dim=1),kind=BLAS_KIND), ONE, umcCPU(lcs,1), & + int(ubound(umcCPU,dim=1),kind=BLAS_KIND) ) + call obj%timer%stop("blas") + if (i==0) cycle + lre = min(l_rows,i*l_rows_tile) + call obj%timer%start("blas") + + if (isSkewsymmetric) then + call PRECISION_GEMM('N', 'N', int(lre,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(lce-lcs+1,kind=BLAS_KIND), & + -ONE, a_mat(1,lcs), int(lda,kind=BLAS_KIND), & + umcCPU(lcs,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), ONE, & + vmrCPU(1,n_cols+1), int(ubound(vmrCPU,dim=1), kind=BLAS_KIND) ) + + else + call PRECISION_GEMM('N', 'N', int(lre,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(lce-lcs+1,kind=BLAS_KIND), & + ONE, a_mat(1,lcs), int(lda,kind=BLAS_KIND), & + umcCPU(lcs,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), ONE, & + vmrCPU(1,n_cols+1), int(ubound(vmrCPU,dim=1), kind=BLAS_KIND) ) + endif + call obj%timer%stop("blas") endif ! useGPU - endif ! l_cols>0 .and. l_rows>0 + enddo ! i=0,(istep*nbw-1)/tile_size + + if (useGPU) then + if (tile_size < istep*nbw .or. n_way > 1) then + successCUDA = cuda_memcpy(int(loc(vmrCUDA(1+cur_l_rows*n_cols)),kind=c_intptr_t), & + vmr_dev+cur_l_rows*n_cols*size_of_datatype, & + (vmr_size-cur_l_rows*n_cols)*size_of_datatype, cudaMemcpyDeviceToHost) + check_memcpy_cuda("bandred: vmr_dev -> vmrCUDA", successCUDA) + endif + + successCUDA = cuda_memcpy(int(loc(umcCUDA(1)),kind=c_intptr_t), & + umc_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost) + check_memcpy_cuda("bandred: umc_dev -> umcCUDA", successCUDA) + endif ! useGPU + endif ! l_cols>0 .and. l_rows>0 #ifdef WITH_OPENMP - endif ! n_way > 1 + endif ! n_way > 1 #if REALCASE == 1 - !$omp end parallel + !$omp end parallel #endif #endif - ! Sum up all ur(:) parts along rows and add them to the uc(:) parts - ! on the processors containing the diagonal - ! This is only necessary if ur has been calculated, i.e. if the - ! global tile size is smaller than the global remaining matrix + ! Sum up all ur(:) parts along rows and add them to the uc(:) parts + ! on the processors containing the diagonal + ! This is only necessary if ur has been calculated, i.e. if the + ! global tile size is smaller than the global remaining matrix - ! Or if we used the Algorithm 4 - if (tile_size < istep*nbw .or. n_way > 1) then - - if (useGPU) then + ! Or if we used the Algorithm 4 + if (tile_size < istep*nbw .or. n_way > 1) then - call elpa_reduce_add_vectors_& - &MATH_DATATYPE& - &_& - &PRECISION & - (obj, vmrCUDA(cur_l_rows * n_cols + 1:),cur_l_rows, & - mpi_comm_rows, umcCUDA, & - cur_l_cols, mpi_comm_cols, istep*nbw, n_cols, nblk, max_threads) - else ! useGPU + if (useGPU) then - call elpa_reduce_add_vectors_& - &MATH_DATATYPE& - &_& - &PRECISION & - (obj, vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, & - umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & - istep*nbw, n_cols, nblk, max_threads) - endif ! useGPU - endif ! tile_size < istep*nbw .or. n_way > 1 + call elpa_reduce_add_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, vmrCUDA(cur_l_rows * n_cols + 1:),cur_l_rows, & + mpi_comm_rows, umcCUDA, & + cur_l_cols, mpi_comm_cols, istep*nbw, n_cols, nblk, max_threads) + else ! useGPU + + call elpa_reduce_add_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, & + umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & + istep*nbw, n_cols, nblk, max_threads) + endif ! useGPU + endif ! tile_size < istep*nbw .or. n_way > 1 - if (l_cols>0) then + if (l_cols>0) then - if (useGPU) then + if (useGPU) then #ifdef WITH_MPI - allocate(tmpCUDA(l_cols * n_cols), stat=istat, errmsg=errorMessage) - check_allocate("bandred: tmpCUDA", istat, errorMessage) + allocate(tmpCUDA(l_cols * n_cols), stat=istat, errmsg=errorMessage) + check_allocate("bandred: tmpCUDA", istat, errorMessage) - if (wantDebug) call obj%timer%start("mpi_communication") + if (wantDebug) call obj%timer%start("mpi_communication") - call mpi_allreduce(umcCUDA, tmpCUDA, int(l_cols*n_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & - MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), ierr) + call mpi_allreduce(umcCUDA, tmpCUDA, int(l_cols*n_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), ierr) - umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols) - if (wantDebug) call obj%timer%stop("mpi_communication") + umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols) + if (wantDebug) call obj%timer%stop("mpi_communication") #endif /* WITH_MPI */ if (allocated(tmpCUDA)) then