#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!    Copyright of the original code rests with the authors inside the ELPA
!    consortium. The copyright of any additional modifications shall rest
!    with their original authors, but shall adhere to the licensing terms
!    distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"

#if REALCASE == 1
subroutine tridiag_band_real_&
#endif
#if COMPLEXCASE == 1
subroutine tridiag_band_complex_&
#endif
&PRECISION &
(obj, na, nb, nblk, aMatrix, a_dev, lda, d, e, matrixCols, &
 hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug)
!-------------------------------------------------------------------------------
! tridiag_band_real/complex:
! Reduces a real symmetric (or complex Hermitian) band matrix to tridiagonal form
!
!  na             Order of matrix a
!
!  nb             Semi bandwidth
!
!  nblk           blocksize of cyclic distribution, must be the same in both directions!
!
!  aMatrix(lda,matrixCols)  Distributed system matrix reduced to banded form in the upper diagonal
!
!  lda            Leading dimension of a
!  matrixCols     local columns of matrix a
!
!  hh_trans :     Householder vectors
!
!  d(na)          Diagonal of tridiagonal matrix, set only on PE 0 (output)
!
!  e(na)          Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
!
!  mpi_comm_rows
!  mpi_comm_cols
!                 MPI-Communicators for rows/columns
!  communicator
!                 MPI-Communicator for the total processor set
!-------------------------------------------------------------------------------
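! Illustrative call sketch (added reading aid, not ELPA's public interface): the
! argument names simply mirror the dummy arguments documented above, and the
! suffix "_double" is only an example of one instantiated PRECISION:
!
!   call tridiag_band_real_double(obj, na, nb, nblk, aMatrix, a_dev, lda, d, e, &
!                                 matrixCols, hh_trans, mpi_comm_rows,          &
!                                 mpi_comm_cols, communicator, useGPU, wantDebug)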
  use elpa_abstract_impl
  use elpa2_workload
  use precision
  use iso_c_binding
  use redist
  implicit none
#include "../general/precision_kinds.F90"
  class(elpa_abstract_impl_t), intent(inout)   :: obj
  logical, intent(in)                          :: useGPU, wantDebug
  integer(kind=ik), intent(in)                 :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
  real(kind=REAL_DATATYPE), intent(in)         :: aMatrix(lda,*)
#else
  real(kind=REAL_DATATYPE), intent(in)         :: aMatrix(lda,matrixCols)
#endif
#endif /* REALCASE */
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
  complex(kind=COMPLEX_DATATYPE), intent(in)   :: aMatrix(lda,*)
#else
  complex(kind=COMPLEX_DATATYPE), intent(in)   :: aMatrix(lda,matrixCols)
#endif
#endif /* COMPLEXCASE */
  integer(kind=c_intptr_t)                     :: a_dev
  real(kind=REAL_DATATYPE), intent(out)        :: d(na), e(na) ! set only on PE 0
  MATH_DATATYPE(kind=rck), intent(out), allocatable :: hh_trans(:,:)

  real(kind=REAL_DATATYPE)                     :: vnorm2
#if REALCASE == 1
  real(kind=REAL_DATATYPE)                     :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
  real(kind=REAL_DATATYPE)                     :: hd(nb), hs(nb)
#endif
#if COMPLEXCASE == 1
  complex(kind=COMPLEX_DATATYPE)               :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
  complex(kind=COMPLEX_DATATYPE)               :: hd(nb), hs(nb)
#endif

  integer(kind=ik)                             :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
  integer(kind=ik)                             :: my_pe, n_pes, mpierr
  integer(kind=ik)                             :: my_prow, np_rows, my_pcol, np_cols
  integer(kind=ik)                             :: ireq_ab, ireq_hv
  integer(kind=ik)                             :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
#ifdef WITH_OPENMP
  integer(kind=ik)                             :: max_threads, my_thread, my_block_s, my_block_e, iter
#ifdef WITH_MPI
!  integer(kind=ik)                            :: my_mpi_status(MPI_STATUS_SIZE)
#endif
!  integer(kind=ik), allocatable               :: mpi_statuses(:,:), global_id_tmp(:,:)
  integer(kind=ik), allocatable                :: global_id_tmp(:,:)
  integer(kind=ik), allocatable                :: omp_block_limits(:)
#if REALCASE == 1
  real(kind=REAL_DATATYPE), allocatable        :: hv_t(:,:), tau_t(:)
#endif
#if COMPLEXCASE == 1
  complex(kind=COMPLEX_DATATYPE), allocatable  :: hv_t(:,:), tau_t(:)
#endif
  integer(kind=ik)                             :: omp_get_max_threads
#endif /* WITH_OPENMP */
  integer(kind=ik), allocatable                :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
  integer(kind=ik), allocatable                :: limits(:), snd_limits(:,:)
  integer(kind=ik), allocatable                :: block_limits(:)
#if REALCASE == 1
  real(kind=REAL_DATATYPE), allocatable        :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
#endif
#if COMPLEXCASE == 1
  complex(kind=COMPLEX_DATATYPE), allocatable  :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
#endif
  integer                                      :: istat
  character(200)                               :: errorMessage

#ifndef WITH_MPI
  integer(kind=ik)                             :: startAddr
#endif

  call obj%timer%start("tridiag_band_&
  &MATH_DATATYPE&
  &" // &
  &PRECISION_SUFFIX &
  )

  if (wantDebug) call obj%timer%start("mpi_communication")
  call mpi_comm_rank(communicator,my_pe,mpierr)
  call mpi_comm_size(communicator,n_pes,mpierr)

  call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
  call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
  call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
  call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
  if (wantDebug) call obj%timer%stop("mpi_communication")
  ! Get global_id mapping 2D processor coordinates to global id

  allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating global_id "//errorMessage
    stop 1
  endif

  global_id(:,:) = 0
  global_id(my_prow, my_pcol) = my_pe
#ifdef WITH_OPENMP
  allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating global_id_tmp "//errorMessage
    stop 1
  endif
#endif

#ifdef WITH_MPI
  if (wantDebug) call obj%timer%start("mpi_communication")
#ifndef WITH_OPENMP
  call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
#else
  global_id_tmp(:,:) = global_id(:,:)
  call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, communicator, mpierr)
  deallocate(global_id_tmp, stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when deallocating global_id_tmp "//errorMessage
    stop 1
  endif
#endif /* WITH_OPENMP */
  if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

  ! Total number of blocks in the band:

  nblocks_total = (na-1)/nb + 1

  ! Set work distribution

  allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating block_limits"//errorMessage
    stop 1
  endif

  call divide_band(obj,nblocks_total, n_pes, block_limits)

  ! nblocks: the number of blocks for my task
  nblocks = block_limits(my_pe+1) - block_limits(my_pe)

  ! allocate the part of the band matrix which is needed by this PE
  ! The size is 1 block larger than needed to avoid extensive shifts
  allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating ab"//errorMessage
    stop 1
  endif

  ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety

  ! n_off: Offset of ab within band
  n_off = block_limits(my_pe)*nb
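  ! Reading aid on the local band storage (summarised from the code below,
  ! added as a comment only): column j of ab holds the lower band of matrix
  ! column j+n_off, i.e.
  !   ab(1,j)       the diagonal element
  !   ab(2:nb+1,j)  the nb subdiagonal elements
  ! so d(.) and e(.) are later taken from ab(1,.) and ab(2,.).  The BLAS calls
  ! below address this storage with leading dimension 2*nb-1; the array has
  ! 2*nb rows so that both the diagonal block and the subdiagonal block can be
  ! reached through that same leading dimension.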
  ! Redistribute band in a to ab
  call redist_band_&
  &MATH_DATATYPE&
  &_&
  &PRECISION&
  &(obj,aMatrix, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)

  ! Calculate the workload for each sweep in the back transformation
  ! and the space requirements to hold the HH vectors

  allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating limits"//errorMessage
    stop 1
  endif

  call determine_workload(obj,na, nb, np_rows, limits)
  max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))

  num_hh_vecs = 0
  num_chunks  = 0
  nx = na
  do n = 1, nblocks_total
    call determine_workload(obj, nx, nb, np_rows, limits)
    local_size = limits(my_prow+1) - limits(my_prow)
    ! add to number of householder vectors
    ! please note: for nx==1 the one and only HH Vector is 0 and is neither calculated nor sent below!
    if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
      num_hh_vecs = num_hh_vecs + local_size
      num_chunks  = num_chunks+1
    endif
    nx = nx - nb
  enddo

  ! Allocate space for HH vectors

  allocate(hh_trans(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
#if REALCASE == 1
    print *,"tridiag_band_real: error when allocating hh_trans"//errorMessage
#endif
#if COMPLEXCASE == 1
    print *,"tridiag_band_complex: error when allocating hh_trans "//errorMessage
#endif
    stop 1
  endif

  ! Allocate and init MPI requests

  allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating ireq_hhr"//errorMessage
    stop 1
  endif

  allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage)    ! Send requests
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating ireq_hhs"//errorMessage
    stop 1
  endif

  num_hh_vecs = 0
  num_chunks  = 0
  nx = na
  nt = 0
  do n = 1, nblocks_total
    call determine_workload(obj,nx, nb, np_rows, limits)
    local_size = limits(my_prow+1) - limits(my_prow)
    if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
      num_chunks  = num_chunks+1
#ifdef WITH_MPI
      if (wantDebug) call obj%timer%start("mpi_communication")
      call mpi_irecv(hh_trans(1,num_hh_vecs+1), &
                     nb*local_size,             &
#if REALCASE == 1
                     MPI_REAL_PRECISION,        &
#endif
#if COMPLEXCASE == 1
                     MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                     nt, 10+n-block_limits(nt), communicator, ireq_hhr(num_chunks), mpierr)
      if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
      ! careful: the data copy of a non-blocking recv must be done at wait or send
      ! hh_trans(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)
#endif /* WITH_MPI */
      num_hh_vecs = num_hh_vecs + local_size
    endif
    nx = nx - nb
    if (n == block_limits(nt+1)) then
      nt = nt + 1
    endif
  enddo
#ifdef WITH_MPI
  ireq_hhs(:) = MPI_REQUEST_NULL
#endif
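  ! Reading aid on the exchange set up above (derived from the send/recv calls
  ! in this file, added as a comment only): each column of hh_trans will hold
  ! one Householder transformation, with the scalar tau in row 1 and the vector
  ! elements 2..nb below it (element 1 of the vector is implicitly 1).  The
  ! receives are posted here in advance; the matching sends below use the tag
  ! 10+iblk, which equals 10+n-block_limits(nt) for the global block index n
  ! handled by the sending PE nt.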
  ! Buffers for gathering/sending the HH vectors

  allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating hh_gath"//errorMessage
    stop 1
  endif

  allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating hh_send"//errorMessage
    stop 1
  endif

#if REALCASE == 1
  hh_gath(:,:,:) = CONST_0_0
  hh_send(:,:,:) = CONST_0_0
#endif
#if COMPLEXCASE == 1
  hh_gath(:,:,:) = CONST_COMPLEX_0_0
  hh_send(:,:,:) = CONST_COMPLEX_0_0
#endif

  ! Some counters

  allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating hh_cnt"//errorMessage
    stop 1
  endif

  allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating hh_dst"//errorMessage
    stop 1
  endif

  hh_cnt(:) = 1 ! The first transformation vector is always 0 and not calculated at all
  hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI
  ireq_ab = MPI_REQUEST_NULL
  ireq_hv = MPI_REQUEST_NULL
#endif

  ! Limits for sending

  allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating snd_limits"//errorMessage
    stop 1
  endif

  do iblk=1,nblocks
    call determine_workload(obj, na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
  enddo

#ifdef WITH_OPENMP
  ! OpenMP work distribution:
  max_threads = 1
#if REALCASE == 1
  max_threads = omp_get_max_threads()
#endif
#if COMPLEXCASE == 1
  !$ max_threads = omp_get_max_threads()
#endif
  ! For OpenMP we need at least 2 blocks for every thread
  max_threads = MIN(max_threads, nblocks/2)
  if (max_threads==0) max_threads = 1

  allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating omp_block_limits"//errorMessage
    stop 1
  endif

  ! Get the OpenMP block limits
  call divide_band(obj,nblocks, max_threads, omp_block_limits)

  allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when allocating hv_t, tau_t"//errorMessage
    stop 1
  endif

#if REALCASE == 1
  hv_t = 0
  tau_t = 0
#endif
#if COMPLEXCASE == 1
  hv_t = CONST_COMPLEX_0_0
  tau_t = CONST_COMPLEX_0_0
#endif
#endif /* WITH_OPENMP */

  ! ---------------------------------------------------------------------------
  ! Start of calculations

  na_s = block_limits(my_pe)*nb + 1

  if (my_pe>0 .and. na_s<=na) then
    ! send first column to previous PE
    ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
    ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
    if (wantDebug) call obj%timer%start("mpi_communication")
    call mpi_isend(ab_s, nb+1, &
#if REALCASE == 1
                   MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                   MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                   my_pe-1, 1, communicator, ireq_ab, mpierr)
    if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
  endif

#ifndef WITH_MPI
  startAddr = ubound(hh_trans,dim=2)
#endif /* WITH_MPI */

#ifdef WITH_OPENMP
  do istep=1,na-1-block_limits(my_pe)*nb
#else
  do istep=1,na-1
#endif
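    ! Reading aid (added comment, not original ELPA documentation): every sweep
    ! istep eliminates the band below the first remaining column with a
    ! Householder reflector
    !   H = I - tau * hv * hv^H,  hv(1) = 1,
    ! chosen such that applying H from the left annihilates ab(3:n+1,na_s-n_off).
    ! tau and hv are produced by hh_transform_real/complex below; the diagonal
    ! and subdiagonal of the eliminated column become d(istep) and e(istep).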
    if (my_pe==0) then

      n = MIN(na-na_s,nb) ! number of rows to be reduced
#if REALCASE == 1
      hv(:) = CONST_0_0
      tau = CONST_0_0
#endif
#if COMPLEXCASE == 1
      hv(:) = CONST_COMPLEX_0_0
      tau = CONST_COMPLEX_0_0
#endif
      ! Transform first column of remaining matrix
#if REALCASE == 1
      ! The last step (istep=na-1) is only needed for sending the last HH vectors.
      ! We don't want the sign of the last element flipped (analogous to the other sweeps)
#endif
#if COMPLEXCASE == 1
      ! Opposed to the real case, the last step (istep=na-1) is needed here for making
      ! the last subdiagonal element a real number
#endif

#if REALCASE == 1
      if (istep < na-1) then
        ! Transform first column of remaining matrix
        vnorm2 = sum(ab(3:n+1,na_s-n_off)**2)
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
        vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk8)**2+dimag(ab(3:n+1,na_s-n_off))**2)
#else
        vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk4)**2+aimag(ab(3:n+1,na_s-n_off))**2)
#endif
        if (n<2) vnorm2 = 0. ! Safety only
#endif /* COMPLEXCASE */

#if REALCASE == 1
        call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
        call hh_transform_complex_&
#endif
        &PRECISION &
        (obj, ab(2,na_s-n_off), vnorm2, hf, tau, wantDebug)

#if REALCASE == 1
        hv(1) = CONST_1_0
        hv(2:n) = ab(3:n+1,na_s-n_off)*hf
      endif
#endif
#if COMPLEXCASE == 1
        hv(1) = CONST_COMPLEX_1_0
        hv(2:n) = ab(3:n+1,na_s-n_off)*hf
#endif

      d(istep) = ab(1,na_s-n_off)
      e(istep) = ab(2,na_s-n_off)

      if (istep == na-1) then
        d(na) = ab(1,na_s+1-n_off)
#if REALCASE == 1
        e(na) = CONST_0_0
#endif
#if COMPLEXCASE == 1
        e(na) = CONST_REAL_0_0
#endif
      endif

    else
      if (na>na_s) then
        ! Receive Householder Vector from previous task, from PE owning subdiagonal
#ifdef WITH_OPENMP

#ifdef WITH_MPI
        if (wantDebug) call obj%timer%start("mpi_communication")
        call mpi_recv(hv, nb, &
#if REALCASE == 1
                      MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                      MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                      my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
        if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
        hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */

#else /* WITH_OPENMP */

#ifdef WITH_MPI
        if (wantDebug) call obj%timer%start("mpi_communication")
        call mpi_recv(hv, nb, &
#if REALCASE == 1
                      MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                      MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                      my_pe-1, 2, communicator, MPI_STATUS_IGNORE, mpierr)
        if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
        hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */

#endif /* WITH_OPENMP */
        tau = hv(1)
#if REALCASE == 1
        hv(1) = CONST_1_0
#endif
#if COMPLEXCASE == 1
        hv(1) = CONST_COMPLEX_1_0
#endif
      endif
    endif

    na_s = na_s+1
    if (na_s-n_off > nb) then
      ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
#if REALCASE == 1
      ab(:,nblocks*nb+1:(nblocks+1)*nb) = CONST_0_0
#endif
#if COMPLEXCASE == 1
      ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0
#endif
      n_off = n_off + nb
    endif

#ifdef WITH_OPENMP
    if (max_threads > 1) then

      ! Codepath for OpenMP

      ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
      ! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
      ! This simulates the behaviour of the MPI tasks which also work after each other.
      ! The code would be considerably easier, if the MPI communication would be made within
      ! the parallel region - this is avoided here since this would require
      ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.

      hv_t(:,1) = hv
      tau_t(1) = tau

      do iter = 1, 2

        ! iter=1 : work on first block
        ! iter=2 : work on remaining blocks
        ! This is done in 2 iterations so that we have a barrier in between:
        ! After the first iteration, it is guaranteed that the last row of the last block
        ! is completed by the next thread.
        ! After the first iteration it is also the place to exchange the last row
        ! with MPI calls
        call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)

!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp&                    nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
        do my_thread = 1, max_threads

          if (iter == 1) then
            my_block_s = omp_block_limits(my_thread-1) + 1
            my_block_e = my_block_s
          else
            my_block_s = omp_block_limits(my_thread-1) + 2
            my_block_e = omp_block_limits(my_thread)
          endif

          do iblk = my_block_s, my_block_e

            ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
            ne = ns+nb-1                                    ! last column in block

            if (istep<my_thread .or. ns+n_off>na) exit

            hv = hv_t(:,my_thread)
            tau = tau_t(my_thread)

            ! Store Householder Vector for back transformation

            hh_cnt(iblk) = hh_cnt(iblk) + 1

            hh_gath(1   ,hh_cnt(iblk),iblk) = tau
            hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

            nc = MIN(na-ns-n_off+1,nb)    ! number of columns in diagonal block
            nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                          ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
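            ! Reading aid for the diagonal-block update below (standard two-sided
            ! Householder update, restated here as a comment only): with
            !   H = I - tau*hv*hv^H
            ! the result H*A*H equals A - hd*hv^H - hv*hd^H, where
            !   hd = tau*A*hv - 0.5*(hv^H*(tau*A*hv))*tau*hv.
            ! The SYMV/HEMV call computes tau*A*hv, the scalar x together with
            ! "hd = hd - 0.5*x*hv" applies the correction, and SYR2/HER2 performs
            ! the rank-2 update (with conjugation in the complex case).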
            ! Transform diagonal block

            if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
            call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_0_0, hd, 1)
#endif
#if COMPLEXCASE == 1
            call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_COMPLEX_PAIR_0_0, hd, 1)
#endif
            if (wantDebug) call obj%timer%stop("blas")

#if REALCASE == 1
            x = dot_product(hv(1:nc),hd(1:nc))*tau
            hd(1:nc) = hd(1:nc) - CONST_0_5*x*hv(1:nc)
#endif
#if COMPLEXCASE == 1
            x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
            hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc)
#endif
            if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
            call PRECISION_SYR2('L', nc, -CONST_1_0 ,hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#endif
#if COMPLEXCASE == 1
            call PRECISION_HER2('L', nc, CONST_COMPLEX_PAIR_NEGATIVE_1_0, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#endif
            if (wantDebug) call obj%timer%stop("blas")

#if REALCASE == 1
            hv_t(:,my_thread) = CONST_0_0
            tau_t(my_thread)  = CONST_0_0
#endif
#if COMPLEXCASE == 1
            hv_t(:,my_thread) = CONST_COMPLEX_0_0
            tau_t(my_thread)  = CONST_COMPLEX_0_0
#endif
            if (nr<=0) cycle ! No subdiagonal block present any more

            ! Transform subdiagonal block

            if (wantDebug) call obj%timer%start("blas")
            call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, &
#if REALCASE == 1
                                CONST_0_0, &
#endif
#if COMPLEXCASE == 1
                                CONST_COMPLEX_PAIR_0_0, &
#endif
                                hs, 1)
            if (wantDebug) call obj%timer%stop("blas")

            if (nr>1) then

              ! complete (old) Householder transformation for first column

              ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1

              ! calculate new Householder transformation for first column
              ! (stored in hv_t(:,my_thread) and tau_t(my_thread))

#if REALCASE == 1
              vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
              vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
#else
              vnorm2 = sum(real(ab(nb+2:nb+nr,ns))**2+aimag(ab(nb+2:nb+nr,ns))**2)
#endif
#endif /* COMPLEXCASE */

#if REALCASE == 1
              call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
              call hh_transform_complex_&
#endif
              &PRECISION &
              (obj, ab(nb+1,ns), vnorm2, hf, tau_t(my_thread), wantDebug)

#if REALCASE == 1
              hv_t(1   ,my_thread) = CONST_1_0
#endif
#if COMPLEXCASE == 1
              hv_t(1   ,my_thread) = CONST_COMPLEX_1_0
#endif
              hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
#if REALCASE == 1
              ab(nb+2:,ns) = CONST_0_0
#endif
#if COMPLEXCASE == 1
              ab(nb+2:,ns) = CONST_COMPLEX_0_0
#endif
              ! update subdiagonal block for old and new Householder transformation
              ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster

              if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
              call PRECISION_GEMV('T', &
#endif
#if COMPLEXCASE == 1
              call PRECISION_GEMV('C', &
#endif
                                  nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, &
#if REALCASE == 1
                                  CONST_0_0, &
#endif
#if COMPLEXCASE == 1
                                  CONST_COMPLEX_PAIR_0_0, &
#endif
                                  h(2), 1)
              if (wantDebug) call obj%timer%stop("blas")

              x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
              h(2:nb) = h(2:nb) - x*hv(2:nb)
              ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
              do i=2,nb
                ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)* &
#if REALCASE == 1
                                              h(i) - hs(1:nr)*hv(i)
#endif
#if COMPLEXCASE == 1
                                              conjg(h(i)) - hs(1:nr)*conjg(hv(i))
#endif
              enddo

            else

              ! No new Householder transformation for nr=1, just complete the old one

              ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
              do i=2,nb
#if REALCASE == 1
                ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
#endif
#if COMPLEXCASE == 1
                ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
#endif
              enddo
              ! For safety: there is one remaining dummy transformation (but tau is 0 anyways)
#if REALCASE == 1
              hv_t(1,my_thread) = CONST_1_0
#endif
#if COMPLEXCASE == 1
              hv_t(1,my_thread) = CONST_COMPLEX_1_0
#endif
            endif

          enddo
        enddo ! my_thread
!$omp end parallel do
        call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)

        if (iter==1) then

          ! We are at the end of the first block

          ! Send our first column to previous PE
          if (my_pe>0 .and. na_s <= na) then
#ifdef WITH_MPI
            if (wantDebug) call obj%timer%start("mpi_communication")
            call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
#endif
            ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
            if (wantDebug) call obj%timer%start("mpi_communication")
            call mpi_isend(ab_s, nb+1, &
#if REALCASE == 1
                           MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                           MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                           my_pe-1, 1, communicator, ireq_ab, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
          endif

          ! Request last column from next PE
          ne = na_s + nblocks*nb - (max_threads-1) - 1
#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")

          if (istep>=max_threads .and. ne <= na) then
            call mpi_recv(ab(1,ne-n_off), nb+1, &
#if REALCASE == 1
                          MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                          MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                          my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
          endif
          if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          if (istep>=max_threads .and. ne <= na) then
            ab(1:nb+1,ne-n_off) = ab_s(1:nb+1)
          endif
#endif /* WITH_MPI */

        else

          ! We are at the end of all blocks

          ! Send last HH Vector and TAU to next PE if it has been calculated above
          ne = na_s + nblocks*nb - (max_threads-1) - 1
          if (istep>=max_threads .and. ne < na) then
#ifdef WITH_MPI
            if (wantDebug) call obj%timer%start("mpi_communication")
            call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
#endif
            hv_s(1) = tau_t(max_threads)
            hv_s(2:) = hv_t(2:,max_threads)

#ifdef WITH_MPI
            if (wantDebug) call obj%timer%start("mpi_communication")
            call mpi_isend(hv_s, nb, &
#if REALCASE == 1
                           MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                           MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                           my_pe+1, 2, communicator, ireq_hv, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
          endif

          ! "Send" HH Vector and TAU to next OpenMP thread
          do my_thread = max_threads, 2, -1
            hv_t(:,my_thread) = hv_t(:,my_thread-1)
            tau_t(my_thread)  = tau_t(my_thread-1)
          enddo

        endif
      enddo ! iter

    else

      ! Codepath for 1 thread without OpenMP

      ! The following code is structured in a way to keep waiting times for
      ! other PEs at a minimum, especially if there is only one block.
      ! For this reason, it requests the last column as late as possible
      ! and sends the Householder Vector and the first column as early
      ! as possible.

#endif /* WITH_OPENMP */

      do iblk=1,nblocks
        ns = na_s + (iblk-1)*nb - n_off ! first column in block
        ne = ns+nb-1                    ! last column in block

        if (ns+n_off>na) exit

        ! Store Householder Vector for back transformation

        hh_cnt(iblk) = hh_cnt(iblk) + 1

        hh_gath(1   ,hh_cnt(iblk),iblk) = tau
        hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
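        ! Reading aid (added comment only): hh_gath accumulates the Householder
        ! vectors produced on this PE for block iblk.  Once a full chunk, as
        ! defined by snd_limits, has been collected, it is copied to hh_send and
        ! shipped with a non-blocking send to the PEs that will apply these
        ! vectors in the back transformation (see the send blocks below).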
#ifndef WITH_OPENMP
        if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then

          ! Wait for last transfer to finish
#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
          call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
#endif
          ! Copy vectors into send buffer
          hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
          ! Send to destination

#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
          call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), &
#if REALCASE == 1
                         MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                         MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                         global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), &
                         10+iblk, communicator, ireq_hhs(iblk), mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          ! do the postponed irecv here
          startAddr = startAddr - hh_cnt(iblk)
          hh_trans(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
#endif /* WITH_MPI */

          ! Reset counter and increase destination row
#if REALCASE == 1
          hh_cnt(iblk) = CONST_0_0
#endif
#if COMPLEXCASE == 1
          hh_cnt(iblk) = 0
#endif
          hh_dst(iblk) = hh_dst(iblk)+1
        endif

        ! The following code is structured in a way to keep waiting times for
        ! other PEs at a minimum, especially if there is only one block.
        ! For this reason, it requests the last column as late as possible
        ! and sends the Householder Vector and the first column as early
        ! as possible.
#endif /* WITH_OPENMP */

        nc = MIN(na-ns-n_off+1,nb)    ! number of columns in diagonal block
        nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                      ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

        ! Multiply diagonal block and subdiagonal block with Householder Vector

        if (iblk==nblocks .and. nc==nb) then

          ! We need the last column from the next PE.
          ! First do the matrix multiplications without last column ...

          ! Diagonal block, the contribution of the last element is added below!
#if REALCASE == 1
          ab(1,ne) = CONST_0_0
#endif
#if COMPLEXCASE == 1
          ab(1,ne) = CONST_COMPLEX_0_0
#endif
          if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
          call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_0_0, hd, 1)
#endif
#if COMPLEXCASE == 1
          call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_COMPLEX_PAIR_0_0, hd, 1)
#endif
          ! Subdiagonal block
          if (nr>0) call PRECISION_GEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1, &
#if REALCASE == 1
                                        CONST_0_0, hs, 1)
#endif
#if COMPLEXCASE == 1
                                        CONST_COMPLEX_PAIR_0_0, hs, 1)
#endif
          if (wantDebug) call obj%timer%stop("blas")

          ! ... then request last column ...
#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
#ifdef WITH_OPENMP
          call mpi_recv(ab(1,ne), nb+1, &
#if REALCASE == 1
                        MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                        MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                        my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
#else /* WITH_OPENMP */
          call mpi_recv(ab(1,ne), nb+1, &
#if REALCASE == 1
                        MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                        MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                        my_pe+1, 1, communicator, MPI_STATUS_IGNORE, mpierr)
#endif /* WITH_OPENMP */
          if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          ab(1:nb+1,ne) = ab_s(1:nb+1)
#endif /* WITH_MPI */

          ! ... and complete the result
          hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
          hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau
        else

          ! Normal matrix multiply
          if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
          call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_0_0, hd, 1)
#endif
#if COMPLEXCASE == 1
          call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_COMPLEX_PAIR_0_0, hd, 1)
#endif
          if (nr>0) call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, &
#if REALCASE == 1
                                        CONST_0_0, hs, 1)
#endif
#if COMPLEXCASE == 1
                                        CONST_COMPLEX_PAIR_0_0, hs, 1)
#endif
          if (wantDebug) call obj%timer%stop("blas")
        endif

        ! Calculate first column of subdiagonal block and calculate new
        ! Householder transformation for this column
#if REALCASE == 1
        hv_new(:) = CONST_0_0 ! Needed, last rows must be 0 for nr < nb
        tau_new = CONST_0_0
#endif
#if COMPLEXCASE == 1
        hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb
        tau_new = 0
#endif
        if (nr>0) then

          ! complete (old) Householder transformation for first column

          ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1

          ! calculate new Householder transformation ...
          if (nr>1) then
#if REALCASE == 1
            vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
            vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk8)**2+dimag(ab(nb+2:nb+nr,ns))**2)
#else
            vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk4)**2+aimag(ab(nb+2:nb+nr,ns))**2)
#endif
#endif /* COMPLEXCASE */

#if REALCASE == 1
            call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
            call hh_transform_complex_&
#endif
            &PRECISION &
            (obj, ab(nb+1,ns), vnorm2, hf, tau_new, wantDebug)

#if REALCASE == 1
            hv_new(1) = CONST_1_0
#endif
#if COMPLEXCASE == 1
            hv_new(1) = CONST_COMPLEX_1_0
#endif
            hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf
#if REALCASE == 1
            ab(nb+2:,ns) = CONST_0_0
#endif
#if COMPLEXCASE == 1
            ab(nb+2:,ns) = CONST_COMPLEX_0_0
#endif
          endif ! nr > 1

          ! ... and send it away immediately if this is the last block

          if (iblk==nblocks) then
#ifdef WITH_MPI
            if (wantDebug) call obj%timer%start("mpi_communication")
#ifdef WITH_OPENMP
            call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
#else
            call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
#endif
            if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
            hv_s(1) = tau_new
            hv_s(2:) = hv_new(2:)

#ifdef WITH_MPI
            if (wantDebug) call obj%timer%start("mpi_communication")
            call mpi_isend(hv_s, nb, &
#if REALCASE == 1
                           MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                           MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                           my_pe+1, 2, communicator, ireq_hv, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
          endif
        endif

        ! Transform diagonal block
#if REALCASE == 1
        x = dot_product(hv(1:nc),hd(1:nc))*tau
        hd(1:nc) = hd(1:nc) - CONST_0_5*x*hv(1:nc)
#endif
#if COMPLEXCASE == 1
        x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
        hd(1:nc) = hd(1:nc) - CONST_REAL_0_5*x*hv(1:nc)
#endif

        if (my_pe>0 .and. iblk==1) then

          ! The first column of the diagonal block has to be sent to the previous PE
          ! Calculate first column only ...
#if REALCASE == 1
          ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) - hv(1:nc)*hd(1)
#endif
#if COMPLEXCASE == 1
          ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*conjg(hv(1)) - hv(1:nc)*conjg(hd(1))
#endif
          ! ... send it away ...
#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
#ifdef WITH_OPENMP
          call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
#else
          call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
#endif
          if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
          ab_s(1:nb+1) = ab(1:nb+1,ns)

#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
          call mpi_isend(ab_s, nb+1, &
#if REALCASE == 1
                         MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                         MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                         my_pe-1, 1, communicator, ireq_ab, mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

          ! ... and calculate remaining columns with rank-2 update
          if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
          if (nc>1) call PRECISION_SYR2('L', nc-1, -CONST_1_0, hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1)
#endif
#if COMPLEXCASE == 1
          if (nc>1) call PRECISION_HER2('L', nc-1, CONST_COMPLEX_PAIR_NEGATIVE_1_0, hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1)
#endif
          if (wantDebug) call obj%timer%stop("blas")

        else
          ! No need to send, just a rank-2 update
          if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
          call PRECISION_SYR2('L', nc, -CONST_1_0, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#endif
#if COMPLEXCASE == 1
          call PRECISION_HER2('L', nc, CONST_COMPLEX_PAIR_NEGATIVE_1_0, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
#endif
          if (wantDebug) call obj%timer%stop("blas")
        endif

        ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb

        if (nr>0) then
          if (nr>1) then
            if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
            call PRECISION_GEMV('T', &
#endif
#if COMPLEXCASE == 1
            call PRECISION_GEMV('C', &
#endif
                                nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, hv_new, 1, &
#if REALCASE == 1
                                CONST_0_0, h(2), 1)
#endif
#if COMPLEXCASE == 1
                                CONST_COMPLEX_PAIR_0_0, h(2), 1)
#endif
            if (wantDebug) call obj%timer%stop("blas")

            x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new
            h(2:nb) = h(2:nb) - x*hv(2:nb)
            ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update
            do i=2,nb
#if REALCASE == 1
              ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*h(i) - hs(1:nr)*hv(i)
#endif
#if COMPLEXCASE == 1
              ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i))
#endif
            enddo
          else
            ! No double Householder transformation for nr=1, just complete the row
            do i=2,nb
#if REALCASE == 1
              ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
#endif
#if COMPLEXCASE == 1
              ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
#endif
            enddo
          endif
        endif

        ! Use new HH Vector for the next block
        hv(:) = hv_new(:)
        tau = tau_new

      enddo
#ifdef WITH_OPENMP
    endif
#endif

#ifdef WITH_OPENMP
      do iblk = 1, nblocks

        if (hh_dst(iblk) >= np_rows) exit
        if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit

        if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then

          ! Wait for last transfer to finish
#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
          call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
#endif
          ! Copy vectors into send buffer
          hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
          ! Send to destination

#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
          call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), &
#if REALCASE == 1
                         MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
                         MPI_COMPLEX_EXPLICIT_PRECISION, &
#endif
                         global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), &
                         10+iblk, communicator, ireq_hhs(iblk), mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          ! do the postponed irecv here
          startAddr = startAddr - hh_cnt(iblk)
          hh_trans(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
#endif /* WITH_MPI */

          ! Reset counter and increase destination row
#if REALCASE == 1
          hh_cnt(iblk) = CONST_0_0
#endif
#if COMPLEXCASE == 1
          hh_cnt(iblk) = 0
#endif
          hh_dst(iblk) = hh_dst(iblk)+1
        endif
      enddo
#endif /* WITH_OPENMP */

  enddo ! istep

  ! Finish the last outstanding requests
#ifdef WITH_OPENMP

#ifdef WITH_MPI
  if (wantDebug) call obj%timer%start("mpi_communication")
  call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
  call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
!  allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks)), stat=istat, errmsg=errorMessage)
!  if (istat .ne. 0) then
!    print *,"tridiag_band_real: error when allocating mpi_statuses"//errorMessage
!    stop 1
!  endif

  call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
  call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)
!  deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
!  if (istat .ne. 0) then
!    print *,"tridiag_band_real: error when deallocating mpi_statuses"//errorMessage
!    stop 1
!  endif
  if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

#else /* WITH_OPENMP */

#ifdef WITH_MPI
  if (wantDebug) call obj%timer%start("mpi_communication")
  call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
  call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)

  call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
  call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)
  if (wantDebug) call obj%timer%stop("mpi_communication")
#endif

#endif /* WITH_OPENMP */

#ifdef WITH_MPI
  if (wantDebug) call obj%timer%start("mpi_communication")
  call mpi_barrier(communicator,mpierr)
  if (wantDebug) call obj%timer%stop("mpi_communication")
#endif
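  ! Added comment: the waits above guarantee that all outstanding non-blocking
  ! sends of ab_s/hv_s and all hh_trans chunk transfers have completed, so the
  ! work arrays can be freed below and hh_trans is fully filled when the
  ! routine returns.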
  deallocate(ab, stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when deallocating ab"//errorMessage
    stop 1
  endif

  deallocate(ireq_hhr, ireq_hhs, stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when deallocating ireq_hhr, ireq_hhs"//errorMessage
    stop 1
  endif

  deallocate(hh_cnt, hh_dst, stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when deallocating hh_cnt, hh_dst"//errorMessage
    stop 1
  endif

  deallocate(hh_gath, hh_send, stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when deallocating hh_gath, hh_send"//errorMessage
    stop 1
  endif

  deallocate(limits, snd_limits, stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when deallocating limits, snd_limits"//errorMessage
    stop 1
  endif

  deallocate(block_limits, stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when deallocating block_limits"//errorMessage
    stop 1
  endif

  deallocate(global_id, stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"tridiag_band_&
    &MATH_DATATYPE&
    &: error when deallocating global_id"//errorMessage
    stop 1
  endif

  call obj%timer%stop("tridiag_band_&
  &MATH_DATATYPE&
  &" // &
  &PRECISION_SUFFIX&
  )

! intel compiler bug makes these ifdefs necessary
#if REALCASE == 1
end subroutine tridiag_band_real_&
#endif
#if COMPLEXCASE == 1
end subroutine tridiag_band_complex_&
#endif
&PRECISION