#if 0 ! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), fomerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen , ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! This particular source code file contains additions, changes and ! enhancements authored by Intel Corporation which is not part of ! the ELPA consortium. ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". ! ELPA2 -- 2-stage solver for ELPA ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". #endif subroutine M_bandred_real_PRECISSION(na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, & tmat, tmat_dev, wantDebug, useGPU, success, useQR) !------------------------------------------------------------------------------- ! bandred_real: Reduces a distributed symmetric matrix to band form ! ! Parameters ! ! na Order of matrix ! ! a(lda,matrixCols) Distributed matrix which should be reduced. ! Distribution is like in Scalapack. ! Opposed to Scalapack, a(:,:) must be set completely (upper and lower half) ! a(:,:) is overwritten on exit with the band and the Householder vectors ! in the upper half. ! ! lda Leading dimension of a ! matrixCols local columns of matrix a ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! nbw semi bandwith of output matrix ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! ! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1 ! Factors for the Householder vectors (returned), needed for back transformation ! !------------------------------------------------------------------------------- use cuda_functions use iso_c_binding use elpa1_compute #ifdef HAVE_DETAILED_TIMINGS use timings #endif #ifdef WITH_OPENMP use omp_lib #endif use precision implicit none integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols #ifdef USE_ASSUMED_SIZE real(kind=REAL_DATATYPE) :: a(lda,*), tmat(nbw,nbw,*) #else real(kind=REAL_DATATYPE) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks) #endif real(kind=REAL_DATATYPE) :: eps logical, intent(in) :: useGPU integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: l_cols, l_rows, vmrCols integer(kind=ik) :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow integer(kind=ik) :: istep, ncol, lch, lcx, nlc, mynlc integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile real(kind=REAL_DATATYPE) :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw) real(kind=REAL_DATATYPE), allocatable :: tmpCUDA(:), vmrCUDA(:), umcCUDA(:) real(kind=REAL_DATATYPE), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:) real(kind=REAL_DATATYPE), allocatable :: vr(:) ! needed for blocked QR decomposition integer(kind=ik) :: PQRPARAM(11), work_size real(kind=REAL_DATATYPE) :: dwork_size(1) real(kind=REAL_DATATYPE), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:) ! a_dev is passed from bandred_real to trans_ev_band integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev #ifdef WITH_MPI integer(kind=ik), external :: numroc #endif integer(kind=ik) :: ierr integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size integer(kind=c_size_t) :: lc_start, lc_end integer(kind=ik) :: lr_end integer(kind=ik) :: na_cols !, na_rows logical, intent(in) :: wantDebug logical, intent(out) :: success logical :: successCUDA integer(kind=ik) :: istat character(200) :: errorMessage logical, intent(in) :: useQR integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, & ii, pp, transformChunkSize #ifdef HAVE_DETAILED_TIMINGS call timer%start("bandred_real" + M_PRECISSION_SUFFIX) #endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif success = .true. ! Semibandwith nbw must be a multiple of blocksize nblk if (mod(nbw,nblk)/=0) then if (my_prow==0 .and. my_pcol==0) then if (wantDebug) then write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk' endif success = .false. return endif endif ! na_rows in used nowhere; only na_cols if (useGPU) then #ifdef WITH_MPI ! na_rows = numroc(na, nblk, my_prow, 0, np_rows) na_cols = numroc(na, nblk, my_pcol, 0, np_cols) #else ! na_rows = na na_cols = na #endif endif ! useGPU ! Matrix is split into tiles; work is done only for tiles on the diagonal or above tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide l_rows_tile = tile_size/np_rows ! local rows of a tile l_cols_tile = tile_size/np_cols ! local cols of a tile if (useQR) then if (useGPU) then print *,"qr decomposition at the moment not supported with GPU" stop endif if (which_qr_decomposition == 1) then call qr_pqrparam_init(pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s') allocate(tauvector(na), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when allocating tauvector "//errorMessage stop endif allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when allocating blockheuristic "//errorMessage stop endif l_rows = local_index(na, my_prow, np_rows, nblk, -1) allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when allocating vmrCPU "//errorMessage stop endif vmrCols = na #ifdef USE_ASSUMED_SIZE_QR call M_qr_pdgeqrf_2dcomm_PRECISSION(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), & nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & mpi_comm_rows, mpi_comm_cols, blockheuristic) #else call M_qr_pdgeqrf_2dcomm_PRECISSION(a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), & vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, & nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & mpi_comm_rows, mpi_comm_cols, blockheuristic) #endif work_size = dwork_size(1) allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when allocating work_blocked "//errorMessage stop endif work_blocked = M_CONST_0_0 deallocate(vmrCPU, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when deallocating vmrCPU "//errorMessage stop endif endif ! which_qr_decomposition endif ! useQr if (useGPU) then ! Here we convert the regular host array into a pinned host array successCUDA = cuda_malloc(a_dev, lda*na_cols*M_size_of_PRECISSION_real) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMalloc" stop endif successCUDA = cuda_malloc(tmat_dev, nbw*nbw*M_size_of_PRECISSION_real) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMalloc" stop endif successCUDA = cuda_malloc(vav_dev, nbw*nbw*M_size_of_PRECISSION_real) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMalloc" stop endif cur_l_rows = 0 cur_l_cols = 0 successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)*M_size_of_PRECISSION_real, cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy" stop endif endif ! useGPU do istep = (na-1)/nbw, 1, -1 n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step ! Number of local columns/rows of remaining matrix l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) if (useGPU) then cur_l_rows = max(l_rows, 1) cur_l_cols = max(l_cols, 1) vmr_size = cur_l_rows * 2 * n_cols umc_size = cur_l_cols * 2 * n_cols ! Allocate vmr and umc only if the inew size exceeds their current capacity ! Added for FORTRAN CALLS if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then if (allocated(vr)) then deallocate(vr, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when deallocating vr "//errorMessage stop endif endif allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when allocating vr "//errorMessage stop endif endif if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then if (allocated(vmrCUDA)) then deallocate(vmrCUDA, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when allocating vmrCUDA "//errorMessage stop endif successCUDA = cuda_free(vmr_dev) if (.not.(successCUDA)) then print *,"bandred_real: error in cuda_free" stop endif endif allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when allocating vmrCUDA "//errorMessage stop endif successCUDA = cuda_malloc(vmr_dev, vmr_size*M_size_of_PRECISSION_real) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMalloc" stop endif endif if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then if (allocated(umcCUDA)) then deallocate(umcCUDA, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when deallocating umcCUDA "//errorMessage stop endif successCUDA = cuda_free(umc_dev) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaFree" stop endif endif allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when deallocating umcCUDA "//errorMessage stop endif successCUDA = cuda_malloc(umc_dev, umc_size*M_size_of_PRECISSION_real) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMalloc" stop endif endif else ! GPU not used ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when allocating vmrCPU "//errorMessage stop endif allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when allocating umcCPU "//errorMessage stop endif allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when allocating vr "//errorMessage stop endif endif ! use GPU if (useGPU) then vmrCUDA(1 : cur_l_rows * n_cols) = M_CONST_0_0 else vmrCPU(1:l_rows,1:n_cols) = M_CONST_0_0 endif vr(:) = M_CONST_0_0 tmat(:,:,istep) = M_CONST_0_0 if (useGPU) then umcCUDA(1 : umc_size) = M_CONST_0_0 lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1) lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1) lr_end = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1) if(lc_start .le. 0) lc_start = 1 ! Here we assume that the processor grid and the block grid are aligned cur_pcol = pcol(istep*nbw+1, nblk, np_cols) if(my_pcol == cur_pcol) then successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), lda*M_size_of_PRECISSION_real, & (a_dev + ((lc_start-1) * lda*M_size_of_PRECISSION_real)), & lda*M_size_of_PRECISSION_real, lr_end*M_size_of_PRECISSION_real, & (lc_end - lc_start+1), cudaMemcpyDeviceToHost) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy2d" stop endif endif endif ! useGPU ! Reduce current block to lower triangular form if (useQR) then if (which_qr_decomposition == 1) then vmrCols = 2*n_cols #ifdef USE_ASSUMED_SIZE_QR call M_qr_pdgeqrf_2dcomm_PRECISSION(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), & na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, & work_size, na, n_cols, nblk, nblk, & istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& blockheuristic) #else call M_qr_pdgeqrf_2dcomm_PRECISSION(a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) , & max(l_rows,1), vmrCols, tauvector(1:na), na, & tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, & work_size, na, n_cols, nblk, nblk, & istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& blockheuristic) #endif endif else !useQR do lc = n_cols, 1, -1 ncol = istep*nbw + lc ! absolute column number of householder vector nrow = ncol - nbw ! Absolute number of pivot row lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number tau = 0 if (nrow == 1) exit ! Nothing to do cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block if (my_pcol==cur_pcol) then ! Get vector to be transformed; distribute last element and norm of ! remaining elements to all procs in current column vr(1:lr) = a(1:lr,lch) ! vector to be transformed if (my_prow==prow(nrow, nblk, np_rows)) then aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) aux1(2) = vr(lr) else aux1(1) = dot_product(vr(1:lr),vr(1:lr)) aux1(2) = M_CONST_0_0 endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_allreduce(aux1, aux2, 2, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ aux2 = aux1 ! this should be optimized #endif vnorm2 = aux2(1) vrl = aux2(2) ! Householder transformation call M_hh_transform_real_PRECISSION(vrl, vnorm2, xf, tau) ! Scale vr and store Householder vector for back transformation vr(1:lr) = vr(1:lr) * xf if (my_prow==prow(nrow, nblk, np_rows)) then a(1:lr-1,lch) = vr(1:lr-1) a(lr,lch) = vrl vr(lr) = M_CONST_1_0 else a(1:lr,lch) = vr(1:lr) endif endif ! Broadcast Householder vector and tau along columns vr(lr+1) = tau #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Bcast(vr, lr+1, M_MPI_REAL_PRECISSION, cur_pcol, mpi_comm_cols, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ if (useGPU) then vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr) else vmrCPU(1:lr,lc) = vr(1:lr) endif tau = vr(lr+1) tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat ! Transform remaining columns in current block with Householder vector ! Local dot product aux1 = 0 #ifdef WITH_OPENMP !Open up one omp region to avoid paying openmp overhead. !This does not help performance due to the addition of two openmp barriers around the MPI call, !But in the future this may be beneficial if these barriers are replaced with a faster implementation !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1) mynlc = 0 ! number of local columns !This loop does not have independent iterations, !'mynlc' is incremented each iteration, and it is difficult to remove this dependency !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration !That is, a thread only executes the work associated with an iteration if its thread id is congruent to !the iteration number modulo the number of threads do j=1,lc-1 lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) if (lcx>0 ) then mynlc = mynlc+1 if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx)) endif endif enddo ! Get global dot products !$omp barrier !$omp single #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ if (mynlc>0) aux2 = aux1 #endif /* WITH_MPI */ !$omp end single !$omp barrier ! Transform transformChunkSize=32 mynlc = 0 do j=1,lc-1 lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) if (lcx>0) then mynlc = mynlc+1 !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32 !However, for some reason this is slower than doing it manually, so it is parallelized as below. do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize do pp = 1,transformChunkSize if (pp + ii > lr) exit a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp) enddo enddo endif enddo !$omp end parallel #else /* WITH_OPENMP */ nlc = 0 ! number of local columns do j=1,lc-1 lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) if (lcx>0) then nlc = nlc+1 if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx)) endif enddo ! Get global dot products #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ if (nlc>0) aux2=aux1 #endif /* WITH_MPI */ ! Transform nlc = 0 do j=1,lc-1 lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) if (lcx>0) then nlc = nlc+1 a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr) endif enddo #endif /* WITH_OPENMP */ enddo ! lc if (useGPU) then ! store column tiles back to GPU cur_pcol = pcol(istep*nbw+1, nblk, np_cols) if (my_pcol == cur_pcol) then successCUDA = cuda_memcpy2d((a_dev+((lc_start-1)*lda*M_size_of_PRECISSION_real)), & lda*M_size_of_PRECISSION_real, loc(a(1, lc_start)), & lda*M_size_of_PRECISSION_real, lr_end*M_size_of_PRECISSION_real, & (lc_end - lc_start+1),cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy2d" stop endif endif endif ! Calculate scalar products of stored Householder vectors. ! This can be done in different ways, we use dsyrk vav = 0 if (useGPU) then if (l_rows>0) & call M_PRECISSION_SYRK('U', 'T', n_cols, l_rows, M_CONST_1_0, vmrCUDA, cur_l_rows, M_CONST_0_0, vav, ubound(vav,dim=1)) else if (l_rows>0) & call M_PRECISSION_SYRK('U', 'T', n_cols, l_rows, M_CONST_1_0, vmrCPU, ubound(vmrCPU,dim=1), M_CONST_0_0, vav, ubound(vav,dim=1)) endif call M_symm_matrix_allreduce_PRECISSION(n_cols,vav, nbw, nbw,mpi_comm_rows) ! Calculate triangular matrix T for block Householder Transformation do lc=n_cols,1,-1 tau = tmat(lc,lc,istep) if (lc vmc (stored in umc, second half) if (useGPU) then call M_elpa_transpose_vectors_real_PRECISSION (vmrCUDA, cur_l_rows, mpi_comm_rows, & umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, mpi_comm_cols, & 1, istep*nbw, n_cols, nblk) else call M_elpa_transpose_vectors_real_PRECISSION (vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, & umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, & 1, istep*nbw, n_cols, nblk) endif ! Calculate umc = A**T * vmr ! Note that the distributed A has to be transposed ! Opposed to direct tridiagonalization there is no need to use the cache locality ! of the tiles, so we can use strips of the matrix ! here the GPU version and CPU version diverged substantially, due to the newest ! optimizations due to Intel. The GPU version has to be re-written if (useGPU) then umcCUDA(1 : l_cols * n_cols) = M_CONST_0_0 vmrCUDA(cur_l_rows * n_cols + 1 : cur_l_rows * n_cols * 2) = M_CONST_0_0 if (l_cols>0 .and. l_rows>0) then successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1)), vmr_size*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy" stop endif successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy" stop endif do i=0,(istep*nbw-1)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) if (lce0 .and. l_rows>0 else ! do not useGPU version !Code for Algorithm 4 n_way = 1 #ifdef WITH_OPENMP n_way = omp_get_max_threads() #endif !umc(1:l_cols,1:n_cols) = 0.d0 !vmr(1:l_rows,n_cols+1:2*n_cols) = 0 #ifdef WITH_OPENMP !$omp parallel private( i,lcs,lce,lrs,lre) #endif if (n_way > 1) then !$omp do do i=1,min(l_cols_tile, l_cols) umcCPU(i,1:n_cols) = M_CONST_0_0 enddo !$omp do do i=1,l_rows vmrCPU(i,n_cols+1:2*n_cols) = M_CONST_0_0 enddo if (l_cols>0 .and. l_rows>0) then !SYMM variant 4 !Partitioned Matrix Expression: ! Ct = Atl Bt + Atr Bb ! Cb = Atr' Bt + Abl Bb ! !Loop invariant: ! Ct = Atl Bt + Atr Bb ! !Update: ! C1 = A10'B0 + A11B1 + A21 B2 ! !This algorithm chosen because in this algoirhtm, the loop around the dgemm calls !is easily parallelized, and regardless of choise of algorithm, !the startup cost for parallelizing the dgemms inside the loop is too great !$omp do schedule(static,1) do i=0,(istep*nbw-1)/tile_size lcs = i*l_cols_tile+1 ! local column start lce = min(l_cols, (i+1)*l_cols_tile) ! local column end lrs = i*l_rows_tile+1 ! local row start lre = min(l_rows, (i+1)*l_rows_tile) ! local row end !C1 += [A11 A12] [B1 ! B2] if ( lre > lrs .and. l_cols > lcs ) then call M_PRECISSION_GEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1, & M_CONST_1_0, a(lrs,lcs), ubound(a,dim=1), & umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), & M_CONST_0_0, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1)) endif ! C1 += A10' B0 if ( lce > lcs .and. i > 0 ) then call M_PRECISSION_GEMM('T', 'N', lce-lcs+1, n_cols, lrs-1, & M_CONST_1_0, a(1,lcs), ubound(a,dim=1), & vmrCPU(1,1), ubound(vmrCPU,dim=1), & M_CONST_0_0, umcCPU(lcs,1), ubound(umcCPU,dim=1)) endif enddo endif ! l_cols>0 .and. l_rows>0 else ! n_way > 1 umcCPU(1:l_cols,1:n_cols) = M_CONST_0_0 vmrCPU(1:l_rows,n_cols+1:2*n_cols) = M_CONST_0_0 if (l_cols>0 .and. l_rows>0) then do i=0,(istep*nbw-1)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) if (lce 1 #ifdef WITH_OPENMP !$omp end parallel #endif endif ! do not useGPU version ! Sum up all ur(:) parts along rows and add them to the uc(:) parts ! on the processors containing the diagonal ! This is only necessary if ur has been calculated, i.e. if the ! global tile size is smaller than the global remaining matrix if (useGPU) then ! here the GPU version and CPU version divereged due to the same reasons as above if (tile_size < istep*nbw) then call M_elpa_reduce_add_vectors_real_PRECISSION (vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows,mpi_comm_rows, & umcCUDA, cur_l_cols, mpi_comm_cols, & istep*nbw, n_cols, nblk) endif if (l_cols>0) then allocate(tmpCUDA(l_cols * n_cols), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when allocating tmpCUDA "//errorMessage stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, ierr) umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols) #endif /* WITH_MPI */ if (allocated(tmpCUDA)) then deallocate(tmpCUDA, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when deallocating tmpCUDA "//errorMessage stop endif endif endif ! l_cols ! U = U * Tmat**T successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*M_size_of_PRECISSION_real, cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy" stop endif successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy" stop endif call M_cublas_PRECISSION_trmm('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, & M_CONST_1_0, tmat_dev, nbw, umc_dev, cur_l_cols) ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy" stop endif call M_cublas_PRECISSION_gemm('T', 'N', n_cols, n_cols, l_cols, & M_CONST_1_0, umc_dev, cur_l_cols, (umc_dev+(cur_l_cols * n_cols )*M_size_of_PRECISSION_real),cur_l_cols, & M_CONST_0_0, vav_dev, nbw) call M_cublas_PRECISSION_trmm('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, & M_CONST_1_0, tmat_dev, nbw, vav_dev, nbw) successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy" stop endif call M_symm_matrix_allreduce_PRECISSION(n_cols,vav, nbw,nbw,mpi_comm_cols) successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy" stop endif ! U = U - 0.5 * V * VAV call M_cublas_PRECISSION_gemm('N', 'N', l_cols, n_cols, n_cols,& -M_CONST_0_5, (umc_dev+(cur_l_cols * n_cols )*M_size_of_PRECISSION_real),cur_l_cols, vav_dev,nbw,& M_CONST_1_0, umc_dev, cur_l_cols) successCUDA = cuda_memcpy(loc(umcCUDA(1)), umc_dev, umc_size*M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy" stop endif ! Transpose umc -> umr (stored in vmr, second half) call M_elpa_transpose_vectors_real_PRECISSION (umcCUDA, cur_l_cols, mpi_comm_cols, & vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, & 1, istep*nbw, n_cols, nblk) successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1)), vmr_size*M_size_of_PRECISSION_real, cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy" stop endif successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*M_size_of_PRECISSION_real, cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy" stop endif ! A = A - V*U**T - U*V**T do i=0,(istep*nbw-1)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) lre = min(l_rows,(i+1)*l_rows_tile) if (lce 1) then call M_elpa_reduce_add_vectors_real_PRECISSION (vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, & umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & istep*nbw, n_cols, nblk) endif if (l_cols>0) then allocate(tmpCPU(l_cols,n_cols), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when allocating tmpCPU "//errorMessage stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_allreduce(umcCPU, tmpCPU, l_cols*n_cols, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr) umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! tmpCPU(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols) #endif /* WITH_MPI */ deallocate(tmpCPU, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"bandred_real: error when deallocating tmpCPU "//errorMessage stop endif endif ! U = U * Tmat**T call M_PRECISSION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', l_cols,n_cols, M_CONST_1_0, tmat(1,1,istep), ubound(tmat,dim=1), & umcCPU, ubound(umcCPU,dim=1)) ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T call M_PRECISSION_GEMM('T', 'N', n_cols, n_cols, l_cols, M_CONST_1_0, umcCPU, ubound(umcCPU,dim=1), umcCPU(1,n_cols+1), & ubound(umcCPU,dim=1), M_CONST_0_0, vav, ubound(vav,dim=1)) call M_PRECISSION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, M_CONST_1_0, tmat(1,1,istep), & ubound(tmat,dim=1), vav, ubound(vav,dim=1)) call M_symm_matrix_allreduce_PRECISSION(n_cols,vav, nbw, nbw ,mpi_comm_cols) ! U = U - 0.5 * V * VAV call M_PRECISSION_GEMM('N', 'N', l_cols, n_cols, n_cols, -M_CONST_0_5, umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), vav, & ubound(vav,dim=1), M_CONST_1_0, umcCPU, ubound(umcCPU,dim=1)) ! Transpose umc -> umr (stored in vmr, second half) call M_elpa_transpose_vectors_real_PRECISSION(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, & 1, istep*nbw, n_cols, nblk) ! A = A - V*U**T - U*V**T #ifdef WITH_OPENMP !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend ) n_threads = omp_get_num_threads() if (mod(n_threads, 2) == 0) then n_way = 2 else n_way = 1 endif m_way = n_threads / n_way m_id = mod(omp_get_thread_num(), m_way) n_id = omp_get_thread_num() / m_way do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way i = ii / tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) lre = min(l_rows,(i+1)*l_rows_tile) if (lce lre ) myend = lre if ( myend-mystart+1 < 1) cycle call M_PRECISSION_GEMM('N', 'T', myend-mystart+1, lce-lcs+1, 2*n_cols, -M_CONST_1_0, & vmrCPU(mystart, 1), ubound(vmrCPU,1), umcCPU(lcs,1), ubound(umcCPU,1), & M_CONST_1_0, a(mystart,lcs), ubound(a,1)) enddo !$omp end parallel #else /* WITH_OPENMP */ do i=0,(istep*nbw-1)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) lre = min(l_rows,(i+1)*l_rows_tile) if (lce0) then call M_cublas_PRECISSION_gemm('T', 'N', n_cols, l_cols, l_rows, M_CONST_1_0, hvm_dev, max_local_rows, & q_dev, ldq , M_CONST_0_0, tmp_dev, n_cols) #ifdef WITH_MPI ! copy data from device to host for a later MPI_ALLREDUCE ! copy to host maybe this can be avoided this is needed if MPI is used (allreduce) successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_real: error in cudaMemcpy" stop endif #endif /* WITH_MPI */ else ! l_rows>0 #ifdef WITH_MPI tmp1(1:l_cols*n_cols) = 0 #else ! if MPI is not used (we do not need to transfer because of MPI_ALLREDUCE) we can set to zero on the device #ifdef WITH_GPU_VERSION successCUDA = cuda_memset(tmp_dev, 0, l_cols*n_cols*M_size_of_PRECISSION_real) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_real: error in cudaMemset" stop endif #endif #endif /* WITH_MPI */ endif ! l_rows>0 !#ifdef WITH_GPU_VERSION ! istat = cuda_memcpy(loc(tmp1), tmp_dev, max_local_cols*nbw*size_of_real_datatype,cudaMemcpyDeviceToHost) ! if (istat .ne. 0) then ! print *,"error in cudaMemcpy" ! stop ! endif !#endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols) #endif /* WITH_MPI */ !#ifdef WITH_GPU_VERSION ! istat = cuda_memcpy(tmp_dev, loc(tmp2), max_local_cols*nbw*size_of_real_datatype,cudaMemcpyHostToDevice) ! if (istat .ne. 0) then ! print *,"error in cudaMemcpy" ! stop ! endif !#endif if (l_rows>0) then #ifdef WITH_MPI ! after the mpi_allreduce we have to copy back to the device ! copy back to device successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_real: error in cudaMemcpy" stop endif #endif /* WITH_MPI */ !#ifdef WITH_MPI ! it should be possible to keep tmat on the device and not copy it aroud ! ! copy to device, maybe this can be avoided tmat is input from bandred_real successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*M_size_of_PRECISSION_real,cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_real: error in cudaMemcpy" stop endif !#endif /* WITH_MPI */ call M_cublas_PRECISSION_trmm('L', 'U', 'T', 'N', n_cols, l_cols, M_CONST_1_0, tmat_dev, nbw, tmp_dev, n_cols) call M_cublas_PRECISSION_gemm('N', 'N', l_rows, l_cols, n_cols, -M_CONST_1_0, hvm_dev, max_local_rows, & tmp_dev, n_cols, M_CONST_1_0, q_dev, ldq) ! copy to host maybe this can be avoided ! this is not necessary hvm is not used anymore successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*M_size_of_PRECISSION_real),cudaMemcpyDeviceToHost) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_real: error in cudaMemcpy" stop endif endif ! l_rows > 0 !#ifdef WITH_GPU_VERSION ! istat = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_real_datatype),cudaMemcpyDeviceToHost) ! if (istat .ne. 0) then ! print *,"error in cudaMemcpy" ! stop ! endif ! !#endif enddo ! istep else ! do not useGPU ! t_blocking was formerly 2; 3 is a better choice t_blocking = 3 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once ! we only use the t_blocking if we could call it fully, this is might be better but needs to benchmarked. ! if ( na >= ((t_blocking+1)*nbw) ) then cwy_blocking = t_blocking * nbw allocate(tmp1(max_local_cols*cwy_blocking)) allocate(tmp2(max_local_cols*cwy_blocking)) allocate(hvb(max_local_rows*cwy_blocking)) allocate(hvm(max_local_rows,cwy_blocking)) allocate(tmat_complete(cwy_blocking,cwy_blocking)) allocate(t_tmp(cwy_blocking,nbw)) allocate(t_tmp2(cwy_blocking,nbw)) ! else ! allocate(tmp1(max_local_cols*nbw)) ! allocate(tmp2(max_local_cols*nbw)) ! allocate(hvb(max_local_rows*nbw)) ! allocate(hvm(max_local_rows,nbw)) ! endif hvm = M_CONST_0_0 ! Must be set to 0 !!! hvb = M_CONST_0_0 ! Safety only l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q ! if ( na >= ((t_blocking+1)*nbw) ) then do istep=1,((na-1)/nbw-1)/t_blocking + 1 ! This the call when using na >= ((t_blocking+1)*nbw) ! n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step ! As an alternative we add some special case handling if na < cwy_blocking IF (na < cwy_blocking) THEN n_cols = MAX(0, na-nbw) IF ( n_cols .eq. 0 ) THEN EXIT END IF ELSE n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step END IF ! Broadcast all Householder vectors for current step compressed in hvb nb = 0 ns = 0 do lc = 1, n_cols ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder vector nrow = ncol - nbw ! absolute number of pivot row l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh) nb = nb+l_rows if (lc==n_cols .or. mod(ncol,nblk)==0) then #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Bcast(hvb(ns+1), nb-ns, M_MPI_REAL_PRECISSION, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ ns = nb endif enddo ! Expand compressed Householder vectors into matrix hvm nb = 0 do lc = 1, n_cols nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows) if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = M_CONST_1_0 nb = nb+l_rows enddo l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1) ! compute tmat2 out of tmat(:,:,) tmat_complete = 0 do i = 1, t_blocking t_cols = MIN(nbw, n_cols - (i-1)*nbw) if (t_cols <= 0) exit t_rows = (i - 1) * nbw tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i) if (i > 1) then call M_PRECISSION_GEMM('T', 'N', t_rows, t_cols, l_rows, M_CONST_1_0, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), & max_local_rows, M_CONST_0_0, t_tmp, cwy_blocking) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_allreduce(t_tmp, t_tmp2, cwy_blocking*nbw, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif call M_PRECISSION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, M_CONST_1_0, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking) call M_PRECISSION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -M_CONST_1_0, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, & t_tmp2, cwy_blocking) tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols) #else ! t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw) call M_PRECISSION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, M_CONST_1_0, tmat_complete, cwy_blocking, t_tmp, cwy_blocking) call M_PRECISSION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -M_CONST_1_0, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, & t_tmp, cwy_blocking) tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp(1:t_rows,1:t_cols) #endif ! call M_PRECISSION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, M_CONST_1_0, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking) ! call M_PRECISSION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -M_CONST_1_0, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, & ! t_tmp2, cwy_blocking) ! tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols) endif enddo ! Q = Q - V * T**T * V**T * Q if (l_rows>0) then call M_PRECISSION_GEMM('T', 'N', n_cols, l_cols, l_rows, M_CONST_1_0, hvm, ubound(hvm,dim=1), & q, ldq, M_CONST_0_0, tmp1, n_cols) else ! l_rows>0 tmp1(1:l_cols*n_cols) = M_CONST_0_0 endif ! l_rows>0 #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows ,mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif if (l_rows>0) then call M_PRECISSION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, M_CONST_1_0, tmat_complete, cwy_blocking, tmp2, n_cols) call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, n_cols, -M_CONST_1_0, hvm, ubound(hvm,dim=1), tmp2, n_cols, M_CONST_1_0, q, ldq) endif #else /* WITH_MPI */ ! tmp2 = tmp1 if (l_rows>0) then call M_PRECISSION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, M_CONST_1_0, tmat_complete, cwy_blocking, tmp1, n_cols) call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, n_cols, -M_CONST_1_0, hvm, ubound(hvm,dim=1), tmp1, n_cols, M_CONST_1_0, q, ldq) endif #endif /* WITH_MPI */ ! if (l_rows>0) then ! call M_PRECISSION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, M_CONST_1_0, tmat_complete, cwy_blocking, tmp2, n_cols) ! call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, n_cols, -M_CONST_1_0, hvm, ubound(hvm,dim=1), tmp2, n_cols, M_CONST_1_0, q, ldq) ! endif enddo ! istep endif ! useGPU deallocate(tmp1, tmp2, hvb, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_band_to_full_real: error when deallocating tmp1 tmp2 hvb "//errorMessage stop endif if (useGPU) then successCUDA = cuda_free(hvm_dev) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_real: error in cudaFree" stop endif successCUDA = cuda_free(tmp_dev) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_real: error in cudaFree" stop endif successCUDA = cuda_free(tmat_dev) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_real: error in cudaFree" stop endif ! final transfer of q_dev successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols*M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_real: error in cudaFree" stop endif ! q(1:ldq,1:na_cols) = q_temp(1:ldq,1:na_cols) successCUDA = cuda_free(q_dev) if (.not.(successCUDA)) then print *,"trans_ev_band_to_full_real: error in cudaFree" stop endif ! deallocate(q_temp, stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"error when deallocating q_temp "//errorMessage ! stop ! endif ! deallocate(tmat_temp, stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"trans_ev_band_to_full_real: error when deallocating tmat_temp "//errorMessage ! stop ! endif endif ! useGPU deallocate(hvm, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_band_to_full_real: error when deallocating hvm "//errorMessage stop endif if (useQr) then deallocate(tmat_complete, t_tmp, t_tmp2, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_band_to_full_real: error when deallocating tmat_complete, t_tmp, t_tmp2 "//errorMessage stop endif endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("trans_ev_band_to_full_real" + M_PRECISSION_SUFFIX) #endif end subroutine M_trans_ev_band_to_full_real_PRECISSION subroutine M_tridiag_band_real_PRECISSION(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_real, & mpi_comm_rows, mpi_comm_cols, mpi_comm) !------------------------------------------------------------------------------- ! tridiag_band_real: ! Reduces a real symmetric band matrix to tridiagonal form ! ! na Order of matrix a ! ! nb Semi bandwith ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal ! ! lda Leading dimension of a ! matrixCols local columns of matrix a ! ! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output) ! ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output) ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! mpi_comm ! MPI-Communicator for the total processor set !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm #ifdef USE_ASSUMED_SIZE real(kind=REAL_DATATYPE), intent(in) :: a(lda,*) #else real(kind=REAL_DATATYPE), intent(in) :: a(lda,matrixCols) #endif real(kind=REAL_DATATYPE), intent(out) :: d(na), e(na) ! set only on PE 0 real(kind=REAL_DATATYPE), intent(out), & allocatable :: hh_trans_real(:,:) real(kind=REAL_DATATYPE) :: vnorm2, hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf real(kind=REAL_DATATYPE) :: hd(nb), hs(nb) integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt integer(kind=ik) :: my_pe, n_pes, mpierr integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols integer(kind=ik) :: ireq_ab, ireq_hv integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off #ifdef WITH_OPENMP integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter #ifdef WITH_MPI ! integer(kind=ik) :: my_mpi_status(MPI_STATUS_SIZE) #endif ! integer(kind=ik), allocatable :: mpi_statuses(:,:), global_id_tmp(:,:) integer(kind=ik), allocatable :: global_id_tmp(:,:) integer(kind=ik), allocatable :: omp_block_limits(:) real(kind=REAL_DATATYPE), allocatable :: hv_t(:,:), tau_t(:) #endif integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:) integer(kind=ik), allocatable :: limits(:), snd_limits(:,:) integer(kind=ik), allocatable :: block_limits(:) real(kind=REAL_DATATYPE), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:) #ifdef WITH_OPENMP integer(kind=ik) :: omp_get_max_threads #endif integer :: istat character(200) :: errorMessage #ifndef WITH_MPI integer(kind=ik) :: startAddr #endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("tridiag_band_real" + M_PRECISSION_SUFFIX) #endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_comm_rank(mpi_comm,my_pe,mpierr) call mpi_comm_size(mpi_comm,n_pes,mpierr) call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif ! Get global_id mapping 2D procssor coordinates to global id allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating global_id "//errorMessage stop endif global_id(:,:) = 0 global_id(my_prow, my_pcol) = my_pe #ifdef WITH_OPENMP allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating global_id_tmp "//errorMessage stop endif #endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif #ifndef WITH_OPENMP call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) #else global_id_tmp(:,:) = global_id(:,:) call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) deallocate(global_id_tmp, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when deallocating global_id_tmp "//errorMessage stop endif #endif /* WITH_OPENMP */ #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ ! Total number of blocks in the band: nblocks_total = (na-1)/nb + 1 ! Set work distribution allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating block_limits"//errorMessage stop endif call divide_band(nblocks_total, n_pes, block_limits) ! nblocks: the number of blocks for my task nblocks = block_limits(my_pe+1) - block_limits(my_pe) ! allocate the part of the band matrix which is needed by this PE ! The size is 1 block larger than needed to avoid extensive shifts allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating ab"//errorMessage stop endif ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety ! n_off: Offset of ab within band n_off = block_limits(my_pe)*nb ! Redistribute band in a to ab call M_redist_band_real_PRECISSION(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab) ! Calculate the workload for each sweep in the back transformation ! and the space requirements to hold the HH vectors allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating limits"//errorMessage stop endif call determine_workload(na, nb, np_rows, limits) max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) num_hh_vecs = 0 num_chunks = 0 nx = na do n = 1, nblocks_total call determine_workload(nx, nb, np_rows, limits) local_size = limits(my_prow+1) - limits(my_prow) ! add to number of householder vectors ! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor send below! if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then num_hh_vecs = num_hh_vecs + local_size num_chunks = num_chunks+1 endif nx = nx - nb enddo ! Allocate space for HH vectors allocate(hh_trans_real(nb,num_hh_vecs), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating hh_trans_real"//errorMessage stop endif ! Allocate and init MPI requests allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating ireq_hhr"//errorMessage stop endif allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage) ! Send requests if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating ireq_hhs"//errorMessage stop endif num_hh_vecs = 0 num_chunks = 0 nx = na nt = 0 do n = 1, nblocks_total call determine_workload(nx, nb, np_rows, limits) local_size = limits(my_prow+1) - limits(my_prow) if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then num_chunks = num_chunks+1 #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_irecv(hh_trans_real(1,num_hh_vecs+1), nb*local_size, M_MPI_REAL_PRECISSION, nt, & 10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! carefull non-block recv data copy must be done at wait or send ! hh_trans_real(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk) #endif /* WITH_MPI */ num_hh_vecs = num_hh_vecs + local_size endif nx = nx - nb if (n == block_limits(nt+1)) then nt = nt + 1 endif enddo #ifdef WITH_MPI ireq_hhs(:) = MPI_REQUEST_NULL #endif ! Buffers for gathering/sending the HH vectors allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating hh_gath"//errorMessage stop endif allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating hh_send"//errorMessage stop endif hh_gath(:,:,:) = M_CONST_0_0 hh_send(:,:,:) = M_CONST_0_0 ! Some counters allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating hh_cnt"//errorMessage stop endif allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating hh_dst"//errorMessage stop endif hh_cnt(:) = 1 ! The first transfomation vector is always 0 and not calculated at all hh_dst(:) = 0 ! PE number for receive #ifdef WITH_MPI ireq_ab = MPI_REQUEST_NULL ireq_hv = MPI_REQUEST_NULL #endif ! Limits for sending allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating snd_limits"//errorMessage stop endif do iblk=1,nblocks call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk)) enddo #ifdef WITH_OPENMP ! OpenMP work distribution: max_threads = 1 max_threads = omp_get_max_threads() ! For OpenMP we need at least 2 blocks for every thread max_threads = MIN(max_threads, nblocks/2) if (max_threads==0) max_threads = 1 allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating omp_block_limits"//errorMessage stop endif ! Get the OpenMP block limits call divide_band(nblocks, max_threads, omp_block_limits) allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating hv_t, tau_t"//errorMessage stop endif hv_t = 0 tau_t = 0 #endif /* WITH_OPENMP */ ! --------------------------------------------------------------------------- ! Start of calculations na_s = block_limits(my_pe)*nb + 1 if (my_pe>0 .and. na_s<=na) then ! send first column to previous PE ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_isend(ab_s, nb+1, M_MPI_REAL_PRECISSION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ endif #ifndef WITH_MPI startAddr = ubound(hh_trans_real,dim=2) #endif #ifdef WITH_OPENMP do istep=1,na-1-block_limits(my_pe)*nb #else do istep=1,na-1 #endif if (my_pe==0) then n = MIN(na-na_s,nb) ! number of rows to be reduced hv(:) = M_CONST_0_0 tau = M_CONST_0_0 ! The last step (istep=na-1) is only needed for sending the last HH vectors. ! We don't want the sign of the last element flipped (analogous to the other sweeps) if (istep < na-1) then ! Transform first column of remaining matrix vnorm2 = sum(ab(3:n+1,na_s-n_off)**2) call M_hh_transform_real_PRECISSION(ab(2,na_s-n_off),vnorm2,hf,tau) hv(1) = M_CONST_1_0 hv(2:n) = ab(3:n+1,na_s-n_off)*hf endif d(istep) = ab(1,na_s-n_off) e(istep) = ab(2,na_s-n_off) if (istep == na-1) then d(na) = ab(1,na_s+1-n_off) e(na) = M_CONST_0_0 endif else if (na>na_s) then ! Receive Householder vector from previous task, from PE owning subdiagonal #ifdef WITH_OPENMP #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_recv(hv, nb, M_MPI_REAL_PRECISSION, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ hv(1:nb) = hv_s(1:nb) #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_recv(hv, nb, M_MPI_REAL_PRECISSION, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ hv(1:nb) = hv_s(1:nb) #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ tau = hv(1) hv(1) = M_CONST_1_0 endif endif na_s = na_s+1 if (na_s-n_off > nb) then ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) ab(:,nblocks*nb+1:(nblocks+1)*nb) = M_CONST_0_0 n_off = n_off + nb endif #ifdef WITH_OPENMP if (max_threads > 1) then ! Codepath for OpenMP ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread! ! Every thread is one reduction cycle behind its predecessor and thus starts one step later. ! This simulates the behaviour of the MPI tasks which also work after each other. ! The code would be considerably easier, if the MPI communication would be made within ! the parallel region - this is avoided here since this would require ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program. hv_t(:,1) = hv tau_t(1) = tau do iter = 1, 2 ! iter=1 : work on first block ! iter=2 : work on remaining blocks ! This is done in 2 iterations so that we have a barrier in between: ! After the first iteration, it is guaranteed that the last row of the last block ! is completed by the next thread. ! After the first iteration it is also the place to exchange the last row ! with MPI calls #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif !$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, & !$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads) do my_thread = 1, max_threads if (iter == 1) then my_block_s = omp_block_limits(my_thread-1) + 1 my_block_e = my_block_s else my_block_s = omp_block_limits(my_thread-1) + 2 my_block_e = omp_block_limits(my_thread) endif do iblk = my_block_s, my_block_e ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block ne = ns+nb-1 ! last column in block if (istepna) exit hv = hv_t(:,my_thread) tau = tau_t(my_thread) ! Store Householder vector for back transformation hh_cnt(iblk) = hh_cnt(iblk) + 1 hh_gath(1 ,hh_cnt(iblk),iblk) = tau hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) ! Note that nr>=0 implies that diagonal block is full (nc==nb)! ! Transform diagonal block call M_PRECISSION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, M_CONST_0_0, hd, 1) x = dot_product(hv(1:nc),hd(1:nc))*tau hd(1:nc) = hd(1:nc) - M_CONST_0_5*x*hv(1:nc) call M_PRECISSION_SYR2('L', nc, -M_CONST_1_0 ,hd, 1, hv, 1, ab(1,ns), 2*nb-1) hv_t(:,my_thread) = M_CONST_0_0 tau_t(my_thread) = M_CONST_0_0 if (nr<=0) cycle ! No subdiagonal block present any more ! Transform subdiagonal block call M_PRECISSION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, M_CONST_0_0, hs, 1) if (nr>1) then ! complete (old) Householder transformation for first column ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 ! calculate new Householder transformation for first column ! (stored in hv_t(:,my_thread) and tau_t(my_thread)) vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) call M_hh_transform_real_PRECISSION(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread)) hv_t(1 ,my_thread) = M_CONST_1_0 hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf ab(nb+2:,ns) = M_CONST_0_0 ! update subdiagonal block for old and new Householder transformation ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster call M_PRECISSION_GEMV('T',nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, M_CONST_0_0, h(2), 1) x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread) h(2:nb) = h(2:nb) - x*hv(2:nb) ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2") do i=2,nb ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*h(i) - hs(1:nr)*hv(i) enddo else ! No new Householder transformation for nr=1, just complete the old one ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1 do i=2,nb ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) enddo ! For safety: there is one remaining dummy transformation (but tau is 0 anyways) hv_t(1,my_thread) = M_CONST_1_0 endif enddo enddo ! my_thread !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif if (iter==1) then ! We are at the end of the first block ! Send our first column to previous PE if (my_pe>0 .and. na_s <= na) then #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_isend(ab_s, nb+1, M_MPI_REAL_PRECISSION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ endif ! Request last column from next PE ne = na_s + nblocks*nb - (max_threads-1) - 1 #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif if (istep>=max_threads .and. ne <= na) then call mpi_recv(ab(1,ne-n_off), nb+1, M_MPI_REAL_PRECISSION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr) endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ if (istep>=max_threads .and. ne <= na) then ab(1:nb+1,ne-n_off) = ab_s(1:nb+1) endif #endif /* WITH_MPI */ else ! We are at the end of all blocks ! Send last HH vector and TAU to next PE if it has been calculated above ne = na_s + nblocks*nb - (max_threads-1) - 1 if (istep>=max_threads .and. ne < na) then #ifdef WITH_MPI call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr) #endif hv_s(1) = tau_t(max_threads) hv_s(2:) = hv_t(2:,max_threads) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_isend(hv_s, nb, M_MPI_REAL_PRECISSION, my_pe+1, 2, mpi_comm, ireq_hv, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ endif ! "Send" HH vector and TAU to next OpenMP thread do my_thread = max_threads, 2, -1 hv_t(:,my_thread) = hv_t(:,my_thread-1) tau_t(my_thread) = tau_t(my_thread-1) enddo endif enddo ! iter else ! Codepath for 1 thread without OpenMP ! The following code is structured in a way to keep waiting times for ! other PEs at a minimum, especially if there is only one block. ! For this reason, it requests the last column as late as possible ! and sends the Householder vector and the first column as early ! as possible. #endif /* WITH_OPENMP */ do iblk=1,nblocks ns = na_s + (iblk-1)*nb - n_off ! first column in block ne = ns+nb-1 ! last column in block if (ns+n_off>na) exit ! Store Householder vector for back transformation hh_cnt(iblk) = hh_cnt(iblk) + 1 hh_gath(1 ,hh_cnt(iblk),iblk) = tau hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) #ifndef WITH_OPENMP if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then ! Wait for last transfer to finish #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif ! Copy vectors into send buffer hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) ! Send to destination #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), M_MPI_REAL_PRECISSION, & global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), & 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! do the post-poned irecv here startAddr = startAddr - hh_cnt(iblk) hh_trans_real(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) #endif /* WITH_MPI */ ! Reset counter and increase destination row hh_cnt(iblk) = M_CONST_0_0 hh_dst(iblk) = hh_dst(iblk)+1 endif ! The following code is structured in a way to keep waiting times for ! other PEs at a minimum, especially if there is only one block. ! For this reason, it requests the last column as late as possible ! and sends the Householder vector and the first column as early ! as possible. #endif /* WITH_OPENMP */ nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) ! Note that nr>=0 implies that diagonal block is full (nc==nb)! ! Multiply diagonal block and subdiagonal block with Householder vector if (iblk==nblocks .and. nc==nb) then ! We need the last column from the next PE. ! First do the matrix multiplications without last column ... ! Diagonal block, the contribution of the last element is added below! ab(1,ne) = M_CONST_0_0 call M_PRECISSION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, M_CONST_0_0, hd, 1) ! Subdiagonal block if (nr>0) call M_PRECISSION_GEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1, M_CONST_0_0, hs, 1) ! ... then request last column ... #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif #ifdef WITH_OPENMP call mpi_recv(ab(1,ne), nb+1, M_MPI_REAL_PRECISSION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr) #else call mpi_recv(ab(1,ne), nb+1, M_MPI_REAL_PRECISSION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr) #endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ab(1:nb+1,ne) = ab_s(1:nb+1) #endif /* WITH_MPI */ ! ... and complete the result hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau else ! Normal matrix multiply call M_PRECISSION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, M_CONST_0_0, hd, 1) if (nr>0) call M_PRECISSION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, M_CONST_0_0, hs, 1) endif ! Calculate first column of subdiagonal block and calculate new ! Householder transformation for this column hv_new(:) = M_CONST_0_0 ! Needed, last rows must be 0 for nr < nb tau_new = M_CONST_0_0 if (nr>0) then ! complete (old) Householder transformation for first column ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 ! calculate new Householder transformation ... if (nr>1) then vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) call M_hh_transform_real_PRECISSION(ab(nb+1,ns),vnorm2,hf,tau_new) hv_new(1) = M_CONST_1_0 hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf ab(nb+2:,ns) = M_CONST_0_0 endif ! ... and send it away immediatly if this is the last block if (iblk==nblocks) then #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif #ifdef WITH_OPENMP call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) #else call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) #endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ hv_s(1) = tau_new hv_s(2:) = hv_new(2:) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_isend(hv_s, nb, M_MPI_REAL_PRECISSION, my_pe+1, 2, mpi_comm, ireq_hv, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ endif endif ! Transform diagonal block x = dot_product(hv(1:nc),hd(1:nc))*tau hd(1:nc) = hd(1:nc) - M_CONST_0_5*x*hv(1:nc) if (my_pe>0 .and. iblk==1) then ! The first column of the diagonal block has to be send to the previous PE ! Calculate first column only ... ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) - hv(1:nc)*hd(1) ! ... send it away ... #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif #ifdef WITH_OPENMP call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) #else call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) #endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ ab_s(1:nb+1) = ab(1:nb+1,ns) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_isend(ab_s, nb+1, M_MPI_REAL_PRECISSION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif ! ... and calculate remaining columns with rank-2 update if (nc>1) call M_PRECISSION_SYR2('L', nc-1, -M_CONST_1_0, hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1) else ! No need to send, just a rank-2 update call M_PRECISSION_SYR2('L', nc, -M_CONST_1_0, hd, 1, hv, 1, ab(1,ns), 2*nb-1) endif ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb if (nr>0) then if (nr>1) then call M_PRECISSION_GEMV('T', nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, hv_new, 1, M_CONST_0_0, h(2), 1) x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new h(2:nb) = h(2:nb) - x*hv(2:nb) ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update do i=2,nb ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*h(i) - hs(1:nr)*hv(i) enddo else ! No double Householder transformation for nr=1, just complete the row do i=2,nb ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) enddo endif endif ! Use new HH vector for the next block hv(:) = hv_new(:) tau = tau_new enddo #ifdef WITH_OPENMP endif do iblk = 1, nblocks if (hh_dst(iblk) >= np_rows) exit if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then ! Wait for last transfer to finish #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif ! Copy vectors into send buffer hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) ! Send to destination #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), M_MPI_REAL_PRECISSION, & global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), & 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! do the post-poned irecv here startAddr = startAddr - hh_cnt(iblk) hh_trans_real(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) #endif /* WITH_MPI */ ! Reset counter and increase destination row hh_cnt(iblk) = M_CONST_0_0 hh_dst(iblk) = hh_dst(iblk)+1 endif enddo #endif /* WITH_OPENMP */ enddo ! istep ! Finish the last outstanding requests #ifdef WITH_OPENMP #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) ! allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks)), stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"tridiag_band_real: error when allocating mpi_statuses"//errorMessage ! stop ! endif call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) ! deallocate(mpi_statuses, stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"tridiag_band_real: error when deallocating mpi_statuses"//errorMessage ! stop ! endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #endif /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_barrier(mpi_comm,mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif deallocate(ab, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when deallocating ab"//errorMessage stop endif deallocate(ireq_hhr, ireq_hhs, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when deallocating ireq_hhr, ireq_hhs"//errorMessage stop endif deallocate(hh_cnt, hh_dst, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when deallocating hh_cnt, hh_dst"//errorMessage stop endif deallocate(hh_gath, hh_send, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when deallocating hh_gath, hh_send"//errorMessage stop endif deallocate(limits, snd_limits, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when deallocating limits, send_limits"//errorMessage stop endif deallocate(block_limits, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when deallocating block_limits"//errorMessage stop endif deallocate(global_id, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_band_real: error when allocating global_id"//errorMessage stop endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("tridiag_band_real" + M_PRECISSION_SUFFIX) #endif end subroutine M_tridiag_band_real_PRECISSION subroutine M_trans_ev_tridi_to_band_real_PRECISSION(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, & mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, & THIS_REAL_ELPA_KERNEL) !------------------------------------------------------------------------------- ! trans_ev_tridi_to_band_real: ! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix ! ! Parameters ! ! na Order of matrix a, number of rows of matrix q ! ! nev Number eigenvectors to compute (= columns of matrix q) ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! nb semi bandwith ! ! q On input: Eigenvectors of tridiagonal matrix ! On output: Transformed eigenvectors ! Distribution is like in Scalapack. ! ! ldq Leading dimension of q ! matrixCols local columns of matrix q ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns/both ! !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use cuda_functions use precision use pack_unpack_real use pack_unpack_real_gpu use compute_hh_trafo_real use iso_c_binding implicit none logical, intent(in) :: useGPU integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols #ifdef USE_ASSUMED_SIZE real(kind=REAL_DATATYPE) :: q(ldq,*) #else real(kind=REAL_DATATYPE) :: q(ldq,matrixCols) #endif integer(kind=c_intptr_t) :: q_dev real(kind=REAL_DATATYPE), intent(in) :: hh_trans_real(:,:) integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2 integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count #ifdef WITH_OPENMP integer(kind=ik) :: thread_width, csw, b_off, b_len #endif integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd integer(kind=ik) :: a_off, current_tv_off, max_blk_size integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk #ifdef WITH_OPENMP #ifdef WITH_MPI ! integer(kind=ik) :: my_MPI_STATUS_(MPI_STATUS_SIZE) #endif #endif logical :: flag #ifdef WITH_OPENMP real(kind=REAL_DATATYPE), pointer :: aIntern(:,:,:,:) #else real(kind=REAL_DATATYPE), pointer :: aIntern(:,:,:) #endif real(kind=REAL_DATATYPE) :: a_real type(c_ptr) :: aIntern_ptr real(kind=REAL_DATATYPE) , allocatable :: row(:) real(kind=REAL_DATATYPE) , allocatable :: row_group(:,:) #ifdef WITH_OPENMP real(kind=REAL_DATATYPE), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) real(kind=REAL_DATATYPE), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) #else real(kind=REAL_DATATYPE), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:) real(kind=REAL_DATATYPE), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:) #endif real(kind=REAL_DATATYPE), allocatable :: result_buffer(:,:,:) real(kind=REAL_DATATYPE), allocatable :: bcast_buffer(:,:) integer(kind=ik) :: tmp ! real*8, allocatable, device :: a_dev(:,:,:) ! real*8, allocatable, device :: bcast_buffer_dev(:,:) ! real*8, allocatable, device :: row_dev(:) ! real*8, allocatable, device :: row_group_dev(:,:) ! real*8, allocatable, device :: hh_dot_dev(:) ! real*8, allocatable, device :: hh_tau_dev(:) integer(kind=c_intptr_t) :: aIntern_dev integer(kind=c_intptr_t) :: bcast_buffer_dev integer(kind=c_size_t) :: num integer(kind=c_size_t) :: dev_offset, dev_offset_1 integer(kind=c_intptr_t) :: row_dev integer(kind=c_intptr_t) :: row_group_dev integer(kind=c_intptr_t) :: hh_dot_dev integer(kind=c_intptr_t) :: hh_tau_dev Integer(kind=ik) :: top, chunk, this_chunk integer(kind=ik) :: row_group_size, unpack_idx integer(kind=ik) :: n_off integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:) integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:) integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:) #ifdef WITH_OPENMP ! integer(kind=ik), allocatable :: mpi_statuses(:,:) #endif ! MPI send/recv tags, arbitrary integer(kind=ik), parameter :: bottom_recv_tag = 111 integer(kind=ik), parameter :: top_recv_tag = 222 integer(kind=ik), parameter :: result_recv_tag = 333 ! Just for measuring the kernel performance real(kind=c_double) :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double ! long integer integer(kind=lik) :: kernel_flops, kernel_flops_recv #ifdef WITH_OPENMP integer(kind=ik) :: max_threads, my_thread integer(kind=ik) :: omp_get_max_threads #endif logical, intent(in) :: wantDebug logical :: success integer(kind=ik) :: istat character(200) :: errorMessage logical :: successCUDA #ifndef WITH_MPI integer(kind=ik) :: j1 #endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("trans_ev_tridi_to_band_real" + M_PRECISSION_SUFFIX) #endif #ifdef WITH_GPU_VERSION unpack_idx = 0 row_group_size = 0 #endif success = .true. kernel_time = 0.0 kernel_flops = 0 #ifdef WITH_OPENMP max_threads = 1 max_threads = omp_get_max_threads() #endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr) call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr) call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr) call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif if (mod(nbw,nblk)/=0) then if (my_prow==0 .and. my_pcol==0) then if (wantDebug) then write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: ERROR: nbw=',nbw,', nblk=',nblk write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: band backtransform works only for nbw==n*nblk' endif success = .false. return endif endif nfact = nbw / nblk ! local number of eigenvectors l_nev = local_index(nev, my_pcol, np_cols, nblk, -1) if (l_nev==0) then #ifdef WITH_OPENMP thread_width = 0 #endif stripe_width = 0 stripe_count = 0 last_stripe_width = 0 else ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into ! every primary cache if (.not.(useGPU)) then #ifdef WITH_OPENMP thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread #endif #ifdef DOUBLE_PRECISION_REAL stripe_width = 48 ! Must be a multiple of 4 #else stripe_width = 96 ! Must be a multiple of 8 #endif #ifdef WITH_OPENMP stripe_count = (thread_width-1)/stripe_width + 1 #else stripe_count = (l_nev-1)/stripe_width + 1 #endif ! Adapt stripe width so that last one doesn't get too small #ifdef WITH_OPENMP stripe_width = (thread_width-1)/stripe_count + 1 #else stripe_width = (l_nev-1)/stripe_count + 1 #endif #ifdef DOUBLE_PRECISION_REAL stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes ! (4 * sizeof(double) == 32) #else stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes ! (8 * sizeof(float) == 32) #endif else ! GPUs are used stripe_width = 256 ! Must be a multiple of 4 stripe_count = (l_nev - 1) / stripe_width + 1 endif last_stripe_width = l_nev - (stripe_count-1)*stripe_width endif ! Determine the matrix distribution at the beginning allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating limits"//errorMessage stop endif call determine_workload(na, nbw, np_rows, limits) max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) a_dim2 = max_blk_size + nbw if (useGPU) then num = (stripe_width*a_dim2*stripe_count)*M_size_of_PRECISSION_real successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count*M_size_of_PRECISSION_real) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"//errorMessage stop endif successCUDA = cuda_memset(aIntern_dev , 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage stop endif else ! GPUs are not used #if 0 !DEC$ ATTRIBUTES ALIGN: 64:: aIntern #endif #ifdef WITH_OPENMP if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_real)) /= 0) then print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage stop endif call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads]) ! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage) ! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here! #else if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_real)) /= 0) then print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage stop endif call c_f_pointer(aIntern_ptr, aIntern,[stripe_width,a_dim2,stripe_count] ) !allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage) aIntern(:,:,:) = M_CONST_0_0 #endif /* WITH_OPENMP */ endif !useGPU allocate(row(l_nev), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating row"//errorMessage stop endif row(:) = M_CONST_0_0 if (useGPU) then num = (l_nev)*M_size_of_PRECISSION_real successCUDA = cuda_malloc( row_dev,num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMalloc "//errorMessage stop endif successCUDA = cuda_memset(row_dev , 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage stop endif ! "row_group" and "row_group_dev" are needed for GPU optimizations allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating row_group"//errorMessage stop endif row_group(:, :) = M_CONST_0_0 num = (l_nev*nblk)*M_size_of_PRECISSION_real ! call cuda_malloc2d( row_group_dev,l_nev*M_size_of_PRECISSION_real,nblk*size_of_real_datatype) successCUDA = cuda_malloc(row_group_dev, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"//errorMessage stop endif successCUDA = cuda_memset(row_group_dev , 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage stop endif endif ! useGPU ! Copy q from a block cyclic distribution into a distribution with contiguous rows, ! and transpose the matrix using stripes of given stripe_width for cache blocking. ! The peculiar way it is done below is due to the fact that the last row should be ! ready first since it is the first one to start below #ifdef WITH_OPENMP ! Please note about the OMP usage below: ! This is not for speed, but because we want the matrix a in the memory and ! in the cache of the correct thread (if possible) #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads aIntern(:,:,:,my_thread) = M_CONST_0_0 ! if possible, do first touch allocation! enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif #endif /* WITH_OPENMP */ do ip = np_rows-1, 0, -1 if (my_prow == ip) then ! Receive my rows which have not yet been received src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) do i=limits(ip)+1,limits(ip+1) src = mod((i-1)/nblk, np_rows) if (src < my_prow) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Recv(row, l_nev, M_MPI_REAL_PRECISSION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! row(1:l_nev) = row(1:l_nev) #endif /* WITH_MPI */ #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call M_unpack_row_real_cpu_openmp_PRECISSION(aIntern, row, i-limits(ip), my_thread, stripe_count, & thread_width, stripe_width, l_nev) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ #else /* WITH_OPENMP */ if (useGPU) then ! An unpacking of the current row group may occur before queuing the next row call M_unpack_and_prepare_row_group_real_gpu_PRECISSION(row_group, row_group_dev, aIntern_dev, stripe_count, & stripe_width, last_stripe_width, a_dim2, l_nev,& row_group_size, nblk, unpack_idx, i - limits(ip), .false.) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Recv(row_group(:, row_group_size), l_nev, M_MPI_REAL_PRECISSION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct? #endif /* WITH_MPI */ else #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Recv(row, l_nev, M_MPI_REAL_PRECISSION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! row(1:l_nev) = row(1:l_nev) #endif /* WITH_MPI */ call M_unpack_row_real_cpu_PRECISSION(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) endif #endif /* WITH_OPENMP */ elseif (src==my_prow) then src_offset = src_offset+1 if (.not.(useGPU)) row(:) = q(src_offset, 1:l_nev) #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call M_unpack_row_real_cpu_openmp_PRECISSION(aIntern, row, i-limits(ip), my_thread, & stripe_count, thread_width, stripe_width, l_nev) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ #else /* WITH_OPENMP */ if (useGPU) then ! An unpacking of the current row group may occur before queuing the next row call M_unpack_and_prepare_row_group_real_gpu_PRECISSION(row_group, row_group_dev, aIntern_dev, stripe_count, & stripe_width, last_stripe_width, a_dim2, l_nev,& row_group_size, nblk, unpack_idx, i - limits(ip), .false.) row_group(:, row_group_size) = q(src_offset, 1:l_nev) else call M_unpack_row_real_cpu_PRECISSION(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) endif #endif /* WITH_OPENMP */ endif enddo ! Send all rows which have not yet been send src_offset = 0 do dst = 0, ip-1 do i=limits(dst)+1,limits(dst+1) if (mod((i-1)/nblk, np_rows) == my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Send(row, l_nev, M_MPI_REAL_PRECISSION, dst, 0, mpi_comm_rows, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ endif enddo enddo else if (my_prow < ip) then ! Send all rows going to PE ip src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) do i=limits(ip)+1,limits(ip+1) src = mod((i-1)/nblk, np_rows) if (src == my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Send(row, l_nev, M_MPI_REAL_PRECISSION, ip, 0, mpi_comm_rows, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ endif enddo ! Receive all rows from PE ip do i=limits(my_prow)+1,limits(my_prow+1) src = mod((i-1)/nblk, np_rows) if (src == ip) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Recv(row, l_nev, M_MPI_REAL_PRECISSION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! row(1:l_nev) = row(1:l_nev) #endif /* WITH_MPI */ #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call M_unpack_row_real_cpu_openmp_PRECISSION(aIntern, row, i-limits(my_prow), my_thread, & stripe_count, thread_width, stripe_width, l_nev) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif #else /* WITH_OPENMP */ if (useGPU) then ! An unpacking of the current row group may occur before queuing the next row call M_unpack_and_prepare_row_group_real_gpu_PRECISSION(row_group, row_group_dev, aIntern_dev, stripe_count, & stripe_width, last_stripe_width, a_dim2, l_nev, & row_group_size, nblk, unpack_idx, i - limits(my_prow), .false.) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Recv(row_group(:, row_group_size), l_nev, M_MPI_REAL_PRECISSION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ? #endif else #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Recv(row, l_nev, M_MPI_REAL_PRECISSION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! row(1:l_nev) = row(1:l_nev) #endif call M_unpack_row_real_cpu_PRECISSION(aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width) endif #endif /* WITH_OPENMP */ endif enddo endif enddo if (useGPU) then ! Force an unpacking of all remaining rows that haven't been unpacked yet call M_unpack_and_prepare_row_group_real_gpu_PRECISSION(row_group, row_group_dev, aIntern_dev, stripe_count, & stripe_width, last_stripe_width, & a_dim2, l_nev, row_group_size, nblk, unpack_idx, -1, .true.) successCUDA = cuda_devicesynchronize() if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaDeviceSynchronize"//errorMessage stop endif endif ! Set up result buffer queue num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows num_result_buffers = 4*nfact allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating result_buffer"//errorMessage stop endif allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating result_send_request"//errorMessage stop endif allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating result_recv_request"//errorMessage stop endif #ifdef WITH_MPI result_send_request(:) = MPI_REQUEST_NULL result_recv_request(:) = MPI_REQUEST_NULL #endif ! Queue up buffers if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends do j = 1, min(num_result_buffers, num_result_blocks) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, M_MPI_REAL_PRECISSION, 0, result_recv_tag, & mpi_comm_rows, result_recv_request(j), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! carefull the "recv" has to be done at the corresponding wait or send ! result_buffer(1: l_nev*nblk,1,j) =result_buffer(1:l_nev*nblk,1,nbuf) #endif /* WITH_MPI */ enddo endif num_bufs_recvd = 0 ! No buffers received yet ! Initialize top/bottom requests allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating top_send_request"//errorMessage stop endif allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating top_recv_request"//errorMessage stop endif allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating bottom_send_request"//errorMessage stop endif allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating bottom_recv_request"//errorMessage stop endif #ifdef WITH_MPI top_send_request(:) = MPI_REQUEST_NULL top_recv_request(:) = MPI_REQUEST_NULL bottom_send_request(:) = MPI_REQUEST_NULL bottom_recv_request(:) = MPI_REQUEST_NULL #endif #ifdef WITH_OPENMP allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating top_border_send_buffer"//errorMessage stop endif allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating top_border_recv_buffer"//errorMessage stop endif allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_send_buffer"//errorMessage stop endif allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_recv_buffer"//errorMessage stop endif top_border_send_buffer(:,:) = M_CONST_0_0 top_border_recv_buffer(:,:) = M_CONST_0_0 bottom_border_send_buffer(:,:) = M_CONST_0_0 bottom_border_recv_buffer(:,:) = M_CONST_0_0 ! Initialize broadcast buffer #else /* WITH_OPENMP */ allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating top_border_send_bufer"//errorMessage stop endif allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating top_border_recv_buffer"//errorMessage stop endif allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_send_buffer"//errorMessage stop endif allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_recv_buffer"//errorMessage stop endif top_border_send_buffer(:,:,:) = M_CONST_0_0 top_border_recv_buffer(:,:,:) = M_CONST_0_0 bottom_border_send_buffer(:,:,:) = M_CONST_0_0 bottom_border_recv_buffer(:,:,:) = M_CONST_0_0 #endif /* WITH_OPENMP */ allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when allocating bcast_buffer"//errorMessage stop endif bcast_buffer = M_CONST_0_0 if (useGPU) then num = ( nbw * max_blk_size) * M_size_of_PRECISSION_real successCUDA = cuda_malloc(bcast_buffer_dev, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMalloc" stop endif successCUDA = cuda_memset( bcast_buffer_dev, 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMemset" stop endif num = ((max_blk_size-1))*M_size_of_PRECISSION_real successCUDA = cuda_malloc( hh_dot_dev, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMalloc" stop endif successCUDA = cuda_memset( hh_dot_dev, 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMemset" stop endif num = (max_blk_size)*M_size_of_PRECISSION_real successCUDA = cuda_malloc( hh_tau_dev, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMalloc" stop endif successCUDA = cuda_memset( hh_tau_dev, 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMemset" stop endif endif ! useGPU current_tv_off = 0 ! Offset of next row to be broadcast ! ------------------- start of work loop ------------------- a_off = 0 ! offset in aIntern (to avoid unnecessary shifts) top_msg_length = 0 bottom_msg_length = 0 do sweep = 0, (na-1)/nbw current_n = na - sweep*nbw call determine_workload(current_n, nbw, np_rows, limits) current_n_start = limits(my_prow) current_n_end = limits(my_prow+1) current_local_n = current_n_end - current_n_start next_n = max(current_n - nbw, 0) call determine_workload(next_n, nbw, np_rows, limits) next_n_start = limits(my_prow) next_n_end = limits(my_prow+1) next_local_n = next_n_end - next_n_start if (next_n_end < next_n) then bottom_msg_length = current_n_end - next_n_end else bottom_msg_length = 0 endif if (next_local_n > 0) then next_top_msg_length = current_n_start - next_n_start else next_top_msg_length = 0 endif if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then do i = 1, stripe_count #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width" b_len = csw*nbw*max_threads #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, M_MPI_REAL_PRECISSION, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! carefull the "recieve" has to be done at the corresponding wait or send ! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, M_MPI_REAL_PRECISSION, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! carefull the recieve has to be done at the corresponding wait or send ! bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ enddo endif if (current_local_n > 1) then if (my_pcol == mod(sweep,np_cols)) then bcast_buffer(:,1:current_local_n) = hh_trans_real(:,current_tv_off+1:current_tv_off+current_local_n) current_tv_off = current_tv_off + current_local_n endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_bcast(bcast_buffer, nbw*current_local_n, M_MPI_REAL_PRECISSION, mod(sweep,np_cols), mpi_comm_cols, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ if (useGPU) then successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), & nbw * current_local_n * M_size_of_PRECISSION_real , cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy" stop endif call M_extract_hh_tau_real_gpu_PRECISSION(bcast_buffer_dev, hh_tau_dev, nbw, current_local_n, .false.) call M_compute_hh_dot_products_real_gpu_PRECISSION(bcast_buffer_dev, hh_dot_dev, nbw, current_local_n) endif else ! (current_local_n > 1) then ! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_real bcast_buffer(:,1) = M_CONST_0_0 if (useGPU) then successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * M_size_of_PRECISSION_real) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMemset" stop endif call M_extract_hh_tau_real_gpu_PRECISSION(bcast_buffer_dev, hh_tau_dev, nbw, 1, .true.) endif endif ! (current_local_n > 1) then if (l_nev == 0) cycle if (current_local_n > 0) then do i = 1, stripe_count #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif ! Get real stripe width for strip i; ! The last OpenMP tasks may have an even smaller stripe with, ! but we don't care about this, i.e. we send/recv a bit too much in this case. ! csw: current_stripe_width csw = min(stripe_width, thread_width-(i-1)*stripe_width) #endif /* WITH_OPENMP */ !wait_b if (current_n_end < current_n) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads n_off = current_local_n+a_off b_len = csw*nbw b_off = (my_thread-1)*b_len aIntern(1:csw,n_off+1:n_off+nbw,i,my_thread) = & reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif n_off = current_local_n+a_off if (useGPU) then dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) *M_size_of_PRECISSION_real successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc(bottom_border_recv_buffer(1,1,i)), & stripe_width*nbw*M_size_of_PRECISSION_real ,cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy" stop endif else aIntern(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i) endif #endif /* WITH_OPENMP */ if (next_n_end < next_n) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & M_MPI_REAL_PRECISSION, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WTIH_MPI */ ! carefull the recieve has to be done at the corresponding wait or send ! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, M_MPI_REAL_PRECISSION, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ !! carefull the recieve has to be done at the corresponding wait or send !! bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i) #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ endif endif if (current_local_n <= bottom_msg_length + top_msg_length) then !wait_t if (top_msg_length>0) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif if (useGPU) then dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *M_size_of_PRECISSION_real ! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8 successCUDA = cuda_memcpy( aIntern_dev+dev_offset , loc(top_border_recv_buffer(1,1,i)), & stripe_width*top_msg_length*M_size_of_PRECISSION_real , cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy" stop endif else aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) endif ! useGPU #endif /* WITH_OPENMP */ endif ! top_msg_length !compute #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads if (top_msg_length>0) then b_len = csw*top_msg_length b_off = (my_thread-1)*b_len aIntern(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) endif call M_compute_hh_trafo_real_cpu_openmp_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, & bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, & kernel_time, 0, current_local_n, i, my_thread, thread_width, & THIS_REAL_ELPA_KERNEL) ! call compute_hh_trafo_real_cpu_openmp(aIntern,stripe_width,a_dim2,stripe_count, max_threads, l_nev, & ! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & !0, current_local_n, i, my_thread, thread_width, & ! THIS_REAL_ELPA_KERNEL) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ #else /* WITH_OPENMP */ call M_compute_hh_trafo_real_cpu_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & hh_tau_dev, kernel_flops, kernel_time, 0, current_local_n, i, & last_stripe_width, THIS_REAL_ELPA_KERNEL) #endif /* WITH_OPENMP */ !send_b #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif if (bottom_msg_length>0) then n_off = current_local_n+nbw-bottom_msg_length+a_off b_len = csw*bottom_msg_length*max_threads bottom_border_send_buffer(1:b_len,i) = & reshape(aIntern(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Isend(bottom_border_send_buffer(1,i), b_len, M_MPI_REAL_PRECISSION, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ if (next_top_msg_length > 0) then top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw* & next_top_msg_length*max_threads,i) endif #endif /* WITH_MPI */ endif #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif if (bottom_msg_length>0) then n_off = current_local_n+nbw-bottom_msg_length+a_off if (useGPU) then dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *M_size_of_PRECISSION_real successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, & stripe_width * bottom_msg_length * M_size_of_PRECISSION_real ,cudaMemcpyDeviceToHost) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy" stop endif else bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i) endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, M_MPI_REAL_PRECISSION, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ if (next_top_msg_length > 0) then top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) endif #endif /* WITH_MPI */ endif #endif /* WITH_OPENMP */ else ! current_local_n <= bottom_msg_length + top_msg_length !compute #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads call M_compute_hh_trafo_real_cpu_openmp_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, & bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, & kernel_time, current_local_n - bottom_msg_length, & bottom_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL) ! call compute_hh_trafo_real_cpu_openmp(aIntern, stripe_width,a_dim2,stripe_count, max_threads, l_nev, & ! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & !current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, thread_width, & ! THIS_REAL_ELPA_KERNEL) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ !send_b #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif if (bottom_msg_length > 0) then n_off = current_local_n+nbw-bottom_msg_length+a_off b_len = csw*bottom_msg_length*max_threads bottom_border_send_buffer(1:b_len,i) = & reshape(aIntern(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Isend(bottom_border_send_buffer(1,i), b_len, M_MPI_REAL_PRECISSION, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ if (next_top_msg_length > 0) then top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw* & next_top_msg_length*& max_threads,i) endif #endif /* WITH_MPI */ endif #else /* WITH_OPENMP */ call M_compute_hh_trafo_real_cpu_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & hh_tau_dev, kernel_flops, kernel_time, & current_local_n - bottom_msg_length, bottom_msg_length, i, & last_stripe_width, THIS_REAL_ELPA_KERNEL) !send_b #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif if (bottom_msg_length > 0) then n_off = current_local_n+nbw-bottom_msg_length+a_off if (useGPU) then dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *M_size_of_PRECISSION_real successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, & stripe_width*bottom_msg_length*M_size_of_PRECISSION_real ,cudaMemcpyDeviceToHost) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error cudaMemcpy" stop endif else bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i) endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, M_MPI_REAL_PRECISSION, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ if (next_top_msg_length > 0) then top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) endif #endif /* WITH_MPI */ endif #endif /* WITH_OPENMP */ !compute #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call M_compute_hh_trafo_real_cpu_openmp_PRECISSION(aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, & max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, & bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, & kernel_time, top_msg_length, & current_local_n-top_msg_length-bottom_msg_length, i, my_thread, & thread_width, THIS_REAL_ELPA_KERNEL) ! call compute_hh_trafo_real_cpu_openmp(aIntern, stripe_width, a_dim2,stripe_count, max_threads, l_nev, & ! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & ! top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, & ! THIS_REAL_ELPA_KERNEL) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ #else /* WITH_OPENMP */ call M_compute_hh_trafo_real_cpu_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & hh_tau_dev, kernel_flops, kernel_time, top_msg_length, & current_local_n-top_msg_length-bottom_msg_length, i, & last_stripe_width, THIS_REAL_ELPA_KERNEL) #endif /* WITH_OPENMP */ !wait_t if (top_msg_length>0) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif if (useGPU) then dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *M_size_of_PRECISSION_real successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc( top_border_recv_buffer(:,1,i)), & stripe_width * top_msg_length *M_size_of_PRECISSION_real ,cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy" stop endif else aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) endif #endif /* WITH_OPENMP */ endif !compute #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads if (top_msg_length>0) then b_len = csw*top_msg_length b_off = (my_thread-1)*b_len aIntern(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) endif call M_compute_hh_trafo_real_cpu_openmp_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, & bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, & kernel_time, 0, top_msg_length, i, my_thread, thread_width, & THIS_REAL_ELPA_KERNEL) ! call compute_hh_trafo_real_cpu_openmp(aIntern, stripe_width,a_dim2,stripe_count, max_threads, l_nev, & ! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & ! 0, top_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ #else /* WITH_OPENMP */ call M_compute_hh_trafo_real_cpu_PRECISSION(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & hh_tau_dev, kernel_flops, kernel_time, 0, top_msg_length, i, & last_stripe_width, THIS_REAL_ELPA_KERNEL) #endif /* WITH_OPENMP */ endif if (next_top_msg_length > 0) then !request top_border data #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif b_len = csw*next_top_msg_length*max_threads #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Irecv(top_border_recv_buffer(1,i), b_len, M_MPI_REAL_PRECISSION, my_prow-1, & top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! carefull the "recieve" has to be done at the corresponding wait or send ! top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & ! bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, M_MPI_REAL_PRECISSION, my_prow-1, & top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! carefull the "recieve" has to be done at the corresponding wait or send ! top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & ! bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ endif !send_t if (my_prow > 0) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif b_len = csw*nbw*max_threads top_border_send_buffer(1:b_len,i) = reshape(aIntern(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Isend(top_border_send_buffer(1,i), b_len, M_MPI_REAL_PRECISSION, & my_prow-1, bottom_recv_tag, & mpi_comm_rows, top_send_request(i), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) endif if (next_n_end < next_n) then bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) endif #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif if (useGPU) then dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * M_size_of_PRECISSION_real successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), aIntern_dev + dev_offset, & stripe_width*nbw*M_size_of_PRECISSION_real ,cudaMemcpyDeviceToHost) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy" stop endif else top_border_send_buffer(:,1:nbw,i) = aIntern(:,a_off+1:a_off+nbw,i) endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, M_MPI_REAL_PRECISSION, my_prow-1, bottom_recv_tag, & mpi_comm_rows, top_send_request(i), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) endif if (next_n_end < next_n) then bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i) endif #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ endif ! Care that there are not too many outstanding top_recv_request's if (stripe_count > 1) then if (i>1) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #endif /* WITH_OPENMP */ else #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #endif /* WITH_OPENMP */ endif endif enddo top_msg_length = next_top_msg_length else ! wait for last top_send_request do i = 1, stripe_count #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #endif /* WITH_OPENMP */ enddo endif ! Care about the result if (my_prow == 0) then ! topmost process sends nbw rows to destination processes do j=0, nfact-1 num_blk = sweep*nfact+j ! global number of destination block, 0 based if (num_blk*nblk >= na) exit nbuf = mod(num_blk, num_result_buffers) + 1 ! buffer number to get this block #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #endif /* WITH_OPENMP */ dst = mod(num_blk, np_rows) if (dst == 0) then if (useGPU) then row_group_size = min(na - num_blk*nblk, nblk) call M_pack_row_group_real_gpu_PRECISSION(row_group_dev, aIntern_dev, stripe_count, stripe_width, & last_stripe_width, a_dim2, l_nev, & row_group(:, :), j * nblk + a_off, row_group_size) do i = 1, row_group_size q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i) enddo else ! useGPU do i = 1, min(na - num_blk*nblk, nblk) #ifdef WITH_OPENMP call M_pack_row_real_cpu_openmp_PRECISSION(aIntern, row, j*nblk+i+a_off, stripe_width, & stripe_count, max_threads, thread_width, l_nev) #else call M_pack_row_real_cpu_PRECISSION(aIntern, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) #endif q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) enddo endif ! useGPU else ! (dst == 0) if (useGPU) then call M_pack_row_group_real_gpu_PRECISSION(row_group_dev, aIntern_dev, stripe_count, stripe_width, & last_stripe_width, a_dim2, l_nev, & result_buffer(:, :, nbuf), j * nblk + a_off, nblk) else ! useGPU do i = 1, nblk #ifdef WITH_OPENMP call M_pack_row_real_cpu_openmp_PRECISSION(aIntern, result_buffer(:,i,nbuf), j*nblk+i+a_off, & stripe_width, stripe_count, max_threads, thread_width, l_nev) #else call M_pack_row_real_cpu_PRECISSION(aIntern, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, & last_stripe_width, stripe_count) #endif enddo endif ! useGPU #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, M_MPI_REAL_PRECISSION, dst, & result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ if (j+num_result_buffers < num_result_blocks) & result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf) if (my_prow > 0 .and. l_nev>0) then do j1 = 1, min(num_result_buffers, num_result_blocks) result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf) enddo endif #endif /* WITH_MPI */ endif ! (dst == 0) enddo !j=0, nfact-1 else ! (my_prow == 0) ! receive and store final result do j = num_bufs_recvd, num_result_blocks-1 nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block ! If there is still work to do, just test for the next result request ! and leave the loop if it is not ready, otherwise wait for all ! outstanding requests if (next_local_n > 0) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ flag = .true. #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ flag = .true. #endif #endif /* WITH_OPENMP */ if (.not.flag) exit else ! (next_local_n > 0) #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #endif /* WITH_OPENMP */ endif ! (next_local_n > 0) ! Fill result buffer into q num_blk = j*np_rows + my_prow ! global number of current block, 0 based do i = 1, min(na - num_blk*nblk, nblk) q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) enddo ! Queue result buffer again if there are outstanding blocks left #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif if (j+num_result_buffers < num_result_blocks) & call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, M_MPI_REAL_PRECISSION, 0, result_recv_tag, & mpi_comm_rows, result_recv_request(nbuf), mpierr) ! carefull the "recieve" has to be done at the corresponding wait or send ! if (j+num_result_buffers < num_result_blocks) & ! result_buffer(1:l_nev*nblk,1,nbuf) = result_buffer(1:l_nev*nblk,1,nbuf) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ #endif /* WITH_MPI */ enddo ! j = num_bufs_recvd, num_result_blocks-1 num_bufs_recvd = j endif ! (my_prow == 0) ! Shift the remaining rows to the front of aIntern (if necessary) offset = nbw - top_msg_length if (offset<0) then if (wantDebug) write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: internal error, offset for shifting = ',offset success = .false. return endif a_off = a_off + offset if (a_off + next_local_n + nbw > a_dim2) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ !$omp parallel do private(my_thread, i, j), schedule(static, 1) do my_thread = 1, max_threads do i = 1, stripe_count do j = top_msg_length+1, top_msg_length+next_local_n aIntern(:,j,i,my_thread) = aIntern(:,j+a_off,i,my_thread) enddo enddo enddo #else /* WITH_OPENMP */ do i = 1, stripe_count if (useGPU) then chunk = min(next_local_n - 1, a_off) do j = top_msg_length + 1, top_msg_length + next_local_n, chunk top = min(j + chunk, top_msg_length + next_local_n) this_chunk = top - j + 1 dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *M_size_of_PRECISSION_real dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * & M_size_of_PRECISSION_real ! it is not logical to set here always the value for the parameter ! "cudaMemcpyDeviceToDevice" do this ONCE at startup ! tmp = cuda_d2d(1) successCUDA = cuda_memcpy( aIntern_dev + dev_offset , aIntern_dev +dev_offset_1, & stripe_width*this_chunk*M_size_of_PRECISSION_real, cudaMemcpyDeviceToDevice) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error cudaMemcpy" stop endif enddo else ! not useGPU do j = top_msg_length+1, top_msg_length+next_local_n aIntern(:,j,i) = aIntern(:,j+a_off,i) enddo endif enddo ! stripe_count #endif /* WITH_OPENMP */ #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ #endif /* WITH_OPENMP */ a_off = 0 endif enddo ! Just for safety: #ifdef WITH_MPI if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol #endif if (my_prow == 0) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop endif #ifdef WITH_MPI ! allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers), stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"trans_ev_tridi_to_band_real: error when allocating mpi_statuses"//errorMessage ! stop ! endif call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) ! deallocate(mpi_statuses, stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"trans_ev_tridi_to_band_real: error when deallocating mpi_statuses"//errorMessage ! stop ! endif #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif #endif /* WITH_OPENMP */ endif #ifdef WITH_MPI if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol #ifdef HAVE_DETAILED_TIMINGS call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_ROWS, mpierr) kernel_flops = kernel_flops_recv call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_COLS, mpierr) kernel_flops = kernel_flops_recv call MPI_ALLREDUCE(kernel_time, kernel_time_recv, 1, MPI_REAL8, MPI_MAX, MPI_COMM_ROWS, mpierr) kernel_time_recv = kernel_time call MPI_ALLREDUCE(kernel_time, kernel_time_recv, 1, MPI_REAL8, MPI_MAX, MPI_COMM_COLS, mpierr) kernel_time_recv = kernel_time #endif #else /* WITH_MPI */ if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6 #endif /* WITH_MPI */ ! copy q to q_dev needed in trans_ev_band_to_full successCUDA = cuda_malloc(q_dev, ldq*matrixCols*M_size_of_PRECISSION_real) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMalloc" stop endif ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*M_size_of_PRECISSION_real, cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaMalloc" stop endif ! deallocate all working space if (.not.(useGPU)) then nullify(aIntern) call free(aIntern_ptr) ! deallocate(aIntern, stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"trans_ev_tridi_to_band_real: error when deallocating aIntern "//errorMessage ! stop ! endif endif deallocate(row, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating row "//errorMessage stop endif deallocate(limits, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating limits"//errorMessage stop endif deallocate(result_send_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating result_send_request "//errorMessage stop endif deallocate(result_recv_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating result_recv_request "//errorMessage stop endif deallocate(top_border_send_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating top_border_send_buffer "//errorMessage stop endif deallocate(top_border_recv_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating top_border_recv_buffer "//errorMessage stop endif deallocate(bottom_border_send_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_border_send_buffer "//errorMessage stop endif deallocate(bottom_border_recv_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_border_recv_buffer "//errorMessage stop endif deallocate(result_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating result_buffer "//errorMessage stop endif deallocate(bcast_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating bcast_buffer "//errorMessage stop endif deallocate(top_send_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating top_send_request "//errorMessage stop endif deallocate(top_recv_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating top_recv_request "//errorMessage stop endif deallocate(bottom_send_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_send_request "//errorMessage stop endif deallocate(bottom_recv_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_recv_request "//errorMessage stop endif if (useGPU) then successCUDA = cuda_free(hh_dot_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage stop endif successCUDA = cuda_free(hh_tau_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage stop endif successCUDA = cuda_free(row_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage stop endif deallocate(row_group, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_real: error when deallocating row_group "//errorMessage stop endif successCUDA = cuda_free(row_group_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage stop endif successCUDA = cuda_free(bcast_buffer_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage stop endif endif ! useGPU #ifdef HAVE_DETAILED_TIMINGS call timer%stop("trans_ev_tridi_to_band_real" + M_PRECISSION_SUFFIX) #endif /* HAVE_DETAILED_TIMINGS */ return end subroutine M_trans_ev_tridi_to_band_real_PRECISSION #ifndef ALREADY_DEFINED_WORKLOAD subroutine determine_workload(na, nb, nprocs, limits) #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: na, nb, nprocs integer(kind=ik), intent(out) :: limits(0:nprocs) integer(kind=ik) :: i #ifdef HAVE_DETAILED_TIMINGS call timer%start("determine_workload") #endif if (na <= 0) then limits(:) = 0 #ifdef HAVE_DETAILED_TIMINGS call timer%stop("determine_workload") #endif return endif if (nb*nprocs > na) then ! there is not enough work for all do i = 0, nprocs limits(i) = min(na, i*nb) enddo else do i = 0, nprocs limits(i) = (i*na)/nprocs enddo endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("determine_workload") #endif end subroutine !--------------------------------------------------------------------------------------------------- ! divide_band: sets the work distribution in band ! Proc n works on blocks block_limits(n)+1 .. block_limits(n+1) subroutine divide_band(nblocks_total, n_pes, block_limits) #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: nblocks_total ! total number of blocks in band integer(kind=ik), intent(in) :: n_pes ! number of PEs for division integer(kind=ik), intent(out) :: block_limits(0:n_pes) integer(kind=ik) :: n, nblocks, nblocks_left #ifdef HAVE_DETAILED_TIMINGS call timer%start("divide_band") #endif block_limits(0) = 0 if (nblocks_total < n_pes) then ! Not enough work for all: The first tasks get exactly 1 block do n=1,n_pes block_limits(n) = min(nblocks_total,n) enddo else ! Enough work for all. If there is no exact loadbalance, ! the LAST tasks get more work since they are finishing earlier! nblocks = nblocks_total/n_pes nblocks_left = nblocks_total - n_pes*nblocks do n=1,n_pes if (n<=n_pes-nblocks_left) then block_limits(n) = block_limits(n-1) + nblocks else block_limits(n) = block_limits(n-1) + nblocks + 1 endif enddo endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("divide_band") #endif end subroutine #define ALREADY_DEFINED_WORKLOAD 1 #endif /* ALREADY_DEFINED_WORKLOAD */ subroutine M_band_band_real_PRECISSION(na, nb, nbCol, nb2, nb2Col, ab, ab2, d, e, mpi_comm) !------------------------------------------------------------------------------- ! band_band_real: ! Reduces a real symmetric banded matrix to a real symmetric matrix with smaller bandwidth. Householder transformations are not stored. ! Matrix size na and original bandwidth nb have to be a multiple of the target bandwidth nb2. (Hint: expand your matrix with ! zero entries, if this ! requirement doesn't hold) ! ! na Order of matrix ! ! nb Semi bandwidth of original matrix ! ! nb2 Semi bandwidth of target matrix ! ! ab Input matrix with bandwidth nb. The leading dimension of the banded matrix has to be 2*nb. The parallel data layout ! has to be accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb+1 to min(na, block_limits(n+1)*nb) ! are located on rank n. ! ! ab2 Output matrix with bandwidth nb2. The leading dimension of the banded matrix is 2*nb2. The parallel data layout is ! accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb2+1 to min(na, block_limits(n+1)*nb2) are located ! on rank n. ! ! d(na) Diagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output) ! ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output) ! ! mpi_comm ! MPI-Communicator for the total processor set !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: na, nb, nbCol, nb2, nb2Col, mpi_comm real(kind=REAL_DATATYPE), intent(inout) :: ab(2*nb,nbCol) ! removed assumed size real(kind=REAL_DATATYPE), intent(inout) :: ab2(2*nb2,nb2Col) ! removed assumed size real(kind=REAL_DATATYPE), intent(out) :: d(na), e(na) ! set only on PE 0 real(kind=REAL_DATATYPE) :: hv(nb,nb2), w(nb,nb2), w_new(nb,nb2), tau(nb2), hv_new(nb,nb2), & tau_new(nb2), ab_s(1+nb,nb2), ab_r(1+nb,nb2), ab_s2(2*nb2,nb2), hv_s(nb,nb2) real(kind=REAL_DATATYPE) :: work(nb*nb2), work2(nb2*nb2) integer(kind=ik) :: lwork, info integer(kind=ik) :: istep, i, n, dest integer(kind=ik) :: n_off, na_s integer(kind=ik) :: my_pe, n_pes, mpierr integer(kind=ik) :: nblocks_total, nblocks integer(kind=ik) :: nblocks_total2, nblocks2 integer(kind=ik) :: ireq_ab, ireq_hv #ifdef WITH_MPI ! integer(kind=ik) :: MPI_STATUS_IGNORE(MPI_STATUS_SIZE) #endif ! integer(kind=ik), allocatable :: mpi_statuses(:,:) integer(kind=ik), allocatable :: block_limits(:), block_limits2(:), ireq_ab2(:) integer(kind=ik) :: j, nc, nr, ns, ne, iblk integer(kind=ik) :: istat character(200) :: errorMessage #ifdef HAVE_DETAILED_TIMINGS call timer%start("band_band_real" + M_PRECISSION_SUFFIX) #endif ! if (na .lt. 2*nb) then ! print *,"na lt 2*nb ",na,2*nb ! stop ! endif ! if (na .lt. 2*nb2) then ! print *,"na lt 2*nb2 ",na,2*nb2 ! stop ! endif ! if (na .lt. nbCol) then ! print *,"na lt nbCol ",na,nbCol ! stop ! endif ! if (na .lt. nb2Col) then ! print *,"na lt nb2Col ",na,nb2Col ! stop ! endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_comm_rank(mpi_comm,my_pe,mpierr) call mpi_comm_size(mpi_comm,n_pes,mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif ! Total number of blocks in the band: nblocks_total = (na-1)/nb + 1 nblocks_total2 = (na-1)/nb2 + 1 ! Set work distribution allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"error allocating block_limits "//errorMessage stop endif call divide_band(nblocks_total, n_pes, block_limits) allocate(block_limits2(0:n_pes), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"error allocating block_limits2 "//errorMessage stop endif call divide_band(nblocks_total2, n_pes, block_limits2) ! nblocks: the number of blocks for my task nblocks = block_limits(my_pe+1) - block_limits(my_pe) nblocks2 = block_limits2(my_pe+1) - block_limits2(my_pe) allocate(ireq_ab2(1:nblocks2), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"error allocating ireq_ab2 "//errorMessage stop endif #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif ireq_ab2 = MPI_REQUEST_NULL if (nb2>1) then do i=0,nblocks2-1 call mpi_irecv(ab2(1,i*nb2+1), 2*nb2*nb2, M_MPI_REAL_PRECISSION, 0, 3, mpi_comm, ireq_ab2(i+1), mpierr) enddo endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ! carefull the "recieve" has to be done at the corresponding send or wait ! if (nb2>1) then ! do i=0,nblocks2-1 ! ab2(1:2*nb2*nb2,i*nb2+1:i*nb2+1+nb2-1) = ab_s2(1:2*nb2,i*nb2+1:nb2) ! enddo ! endif #endif /* WITH_MPI */ ! n_off: Offset of ab within band n_off = block_limits(my_pe)*nb lwork = nb*nb2 dest = 0 #ifdef WITH_MPI ireq_ab = MPI_REQUEST_NULL ireq_hv = MPI_REQUEST_NULL #endif ! --------------------------------------------------------------------------- ! Start of calculations na_s = block_limits(my_pe)*nb + 1 if (my_pe>0 .and. na_s<=na) then ! send first nb2 columns to previous PE ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) do i=1,nb2 ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1) enddo #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_isend(ab_s, (nb+1)*nb2, M_MPI_REAL_PRECISSION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ endif do istep=1,na/nb2 if (my_pe==0) then n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced hv(:,:) = M_CONST_0_0 tau(:) = M_CONST_0_0 ! The last step (istep=na-1) is only needed for sending the last HH vectors. ! We don't want the sign of the last element flipped (analogous to the other sweeps) if (istep < na/nb2) then ! Transform first block column of remaining matrix call M_PRECISSION_GEQRF(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info) do i=1,nb2 hv(i,i) = M_CONST_1_0 hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1) ab(1+nb2+1:2*nb,na_s-n_off+i-1) = M_CONST_0_0 enddo endif if (nb2==1) then d(istep) = ab(1,na_s-n_off) e(istep) = ab(2,na_s-n_off) if (istep == na) then e(na) = M_CONST_0_0 endif else ab_s2 = M_CONST_0_0 ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1) if (block_limits2(dest+1)1) then do i= 0,nblocks2-1 ab2(1:2*nb2*nb2,i*nb2+1:i+nb2+1+nb2-1) = ab_s2(1:2*nb2,1:nb2) enddo endif #endif /* WITH_MPI */ endif else if (na>na_s+nb2-1) then ! Receive Householder vectors from previous task, from PE owning subdiagonal #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_recv(hv, nb*nb2, M_MPI_REAL_PRECISSION, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ hv(1:nb,1:nb2) = hv_s(1:nb,1:nb2) #endif /* WITH_MPI */ do i=1,nb2 tau(i) = hv(i,i) hv(i,i) = M_CONST_1_0 enddo endif endif na_s = na_s+nb2 if (na_s-n_off > nb) then ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) ab(:,nblocks*nb+1:(nblocks+1)*nb) = M_CONST_0_0 n_off = n_off + nb endif do iblk=1,nblocks ns = na_s + (iblk-1)*nb - n_off ! first column in block ne = ns+nb-nb2 ! last column in block if (ns+n_off>na) exit nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) ! Note that nr>=0 implies that diagonal block is full (nc==nb)! call M_wy_gen_PRECISSION(nc,nb2,w,hv,tau,work,nb) if (iblk==nblocks .and. nc==nb) then !request last nb2 columns #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_recv(ab_r,(nb+1)*nb2, M_MPI_REAL_PRECISSION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ ab_r(1:nb+1,1:nb2) = ab_s(1:nb+1,1:nb2) #endif /* WITH_MPI */ do i=1,nb2 ab(1:nb+1,ne+i-1) = ab_r(:,i) enddo endif hv_new(:,:) = M_CONST_0_0 ! Needed, last rows must be 0 for nr < nb tau_new(:) = M_CONST_0_0 if (nr>0) then call M_wy_right_PRECISSION(nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb) call M_PRECISSION_GEQRF(nr, nb2, ab(nb+1,ns), 2*nb-1, tau_new, work, lwork, info) do i=1,nb2 hv_new(i,i) = M_CONST_1_0 hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1) ab(nb+2:,ns+i-1) = M_CONST_0_0 enddo !send hh-vector if (iblk==nblocks) then #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif hv_s = hv_new do i=1,nb2 hv_s(i,i) = tau_new(i) enddo #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_isend(hv_s,nb*nb2, M_MPI_REAL_PRECISSION, my_pe+1, 2, mpi_comm, ireq_hv, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ #endif /* WITH_MPI */ endif endif call M_wy_symm_PRECISSION(nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb) if (my_pe>0 .and. iblk==1) then !send first nb2 columns to previous PE #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif do i=1,nb2 ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1) enddo #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_isend(ab_s,(nb+1)*nb2, M_MPI_REAL_PRECISSION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #else /* WITH_MPI */ #endif /* WITH_MPI */ endif if (nr>0) then call M_wy_gen_PRECISSION(nr,nb2,w_new,hv_new,tau_new,work,nb) call M_wy_left_PRECISSION(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb) endif ! Use new HH vector for the next block hv(:,:) = hv_new(:,:) tau = tau_new enddo enddo ! Finish the last outstanding requests #ifdef WITH_MPI #ifdef HAVE_DETAILED_TIMINGS call timer%start("mpi_communication") #endif call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) ! allocate(mpi_statuses(MPI_STATUS_SIZE,nblocks2), stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"error allocating mpi_statuses "//errorMessage ! stop ! endif call mpi_waitall(nblocks2,ireq_ab2,MPI_STATUSES_IGNORE,mpierr) ! deallocate(mpi_statuses, stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"error deallocating mpi_statuses "//errorMessage ! stop ! endif call mpi_barrier(mpi_comm,mpierr) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mpi_communication") #endif #endif /* WITH_MPI */ deallocate(block_limits, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"error deallocating block_limits "//errorMessage stop endif deallocate(block_limits2, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"error deallocating block_limits2 "//errorMessage stop endif deallocate(ireq_ab2, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"error deallocating ireq_ab2 "//errorMessage stop endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("band_band_real" + M_PRECISSION_SUFFIX) #endif end subroutine subroutine M_wy_gen_PRECISSION(n, nb, W, Y, tau, mem, lda) #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: n !length of householder-vectors integer(kind=ik), intent(in) :: nb !number of householder-vectors integer(kind=ik), intent(in) :: lda !leading dimension of Y and W real(kind=REAL_DATATYPE), intent(in) :: Y(lda,nb) !matrix containing nb householder-vectors of length b real(kind=REAL_DATATYPE), intent(in) :: tau(nb) !tau values real(kind=REAL_DATATYPE), intent(out) :: W(lda,nb) !output matrix W real(kind=REAL_DATATYPE), intent(in) :: mem(nb) !memory for a temporary matrix of size nb integer(kind=ik) :: i #ifdef HAVE_DETAILED_TIMINGS call timer%start("wy_gen" + M_PRECISSION_SUFFIX) #endif W(1:n,1) = tau(1)*Y(1:n,1) do i=2,nb W(1:n,i) = tau(i)*Y(1:n,i) call M_PRECISSION_GEMV('T', n, i-1, M_CONST_1_0, Y, lda, W(1,i), 1, M_CONST_0_0, mem,1) call M_PRECISSION_GEMV('N', n, i-1, -M_CONST_1_0, W, lda, mem, 1, M_CONST_1_0, W(1,i),1) enddo #ifdef HAVE_DETAILED_TIMINGS call timer%stop("wy_gen" + M_PRECISSION_SUFFIX) #endif end subroutine subroutine M_wy_left_PRECISSION(n, m, nb, A, lda, W, Y, mem, lda2) #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: n !width of the matrix A integer(kind=ik), intent(in) :: m !length of matrix W and Y integer(kind=ik), intent(in) :: nb !width of matrix W and Y integer(kind=ik), intent(in) :: lda !leading dimension of A integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y real(kind=REAL_DATATYPE), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size real(kind=REAL_DATATYPE), intent(in) :: W(m,nb) !blocked transformation matrix W real(kind=REAL_DATATYPE), intent(in) :: Y(m,nb) !blocked transformation matrix Y real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb #ifdef HAVE_DETAILED_TIMINGS call timer%start("wy_left" + M_PRECISSION_SUFFIX) #endif call M_PRECISSION_GEMM('T', 'N', nb, n, m, M_CONST_1_0, W, lda2, A, lda, M_CONST_0_0, mem, nb) call M_PRECISSION_GEMM('N', 'N', m, n, nb, -M_CONST_1_0, Y, lda2, mem, nb, M_CONST_1_0, A, lda) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("wy_left" + M_PRECISSION_SUFFIX) #endif end subroutine subroutine M_wy_right_PRECISSION(n, m, nb, A, lda, W, Y, mem, lda2) #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: n !height of the matrix A integer(kind=ik), intent(in) :: m !length of matrix W and Y integer(kind=ik), intent(in) :: nb !width of matrix W and Y integer(kind=ik), intent(in) :: lda !leading dimension of A integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y real(kind=REAL_DATATYPE), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size real(kind=REAL_DATATYPE), intent(in) :: W(m,nb) !blocked transformation matrix W real(kind=REAL_DATATYPE), intent(in) :: Y(m,nb) !blocked transformation matrix Y real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb #ifdef HAVE_DETAILED_TIMINGS call timer%start("wy_right" + M_PRECISSION_SUFFIX) #endif call M_PRECISSION_GEMM('N', 'N', n, nb, m, M_CONST_1_0, A, lda, W, lda2, M_CONST_0_0, mem, n) call M_PRECISSION_GEMM('N', 'T', n, m, nb, -M_CONST_1_0, mem, n, Y, lda2, M_CONST_1_0, A, lda) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("wy_right" + M_PRECISSION_SUFFIX) #endif end subroutine subroutine M_wy_symm_PRECISSION(n, nb, A, lda, W, Y, mem, mem2, lda2) #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: n !width/heigth of the matrix A; length of matrix W and Y integer(kind=ik), intent(in) :: nb !width of matrix W and Y integer(kind=ik), intent(in) :: lda !leading dimension of A integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y real(kind=REAL_DATATYPE), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size real(kind=REAL_DATATYPE), intent(in) :: W(n,nb) !blocked transformation matrix W real(kind=REAL_DATATYPE), intent(in) :: Y(n,nb) !blocked transformation matrix Y real(kind=REAL_DATATYPE) :: mem(n,nb) !memory for a temporary matrix of size n x nb real(kind=REAL_DATATYPE) :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb #ifdef HAVE_DETAILED_TIMINGS call timer%start("wy_symm" + M_PRECISSION_SUFFIX) #endif call M_PRECISSION_SYMM('L', 'L', n, nb, M_CONST_1_0, A, lda, W, lda2, M_CONST_0_0, mem, n) call M_PRECISSION_GEMM('T', 'N', nb, nb, n, M_CONST_1_0, mem, n, W, lda2, M_CONST_0_0, mem2, nb) call M_PRECISSION_GEMM('N', 'N', n, nb, nb, -M_CONST_0_5, Y, lda2, mem2, nb, M_CONST_1_0, mem, n) call M_PRECISSION_SYR2K('L', 'N', n, nb, -M_CONST_1_0, Y, lda2, mem, n, M_CONST_1_0, A, lda) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("wy_symm" + M_PRECISSION_SUFFIX) #endif end subroutine