  subroutine tridiag_band_complex_PRECISION(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_complex, &
                                            mpi_comm_rows, mpi_comm_cols, mpi_comm)
    !-------------------------------------------------------------------------------
    ! tridiag_band_complex:
    ! Reduces a complex Hermitian band matrix to tridiagonal form
    !
    ! na          Order of matrix a
    !
    ! nb          Semi bandwidth
    !
    ! nblk        blocksize of cyclic distribution, must be the same in both directions!
    !
    ! a(lda,matrixCols)  Distributed system matrix reduced to banded form in the upper diagonal
    !
    ! lda         Leading dimension of a
    ! matrixCols  local columns of matrix a
    !
    ! d(na)       Diagonal of tridiagonal matrix, set only on PE 0 (output)
    !
    ! e(na)       Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
    !
    ! mpi_comm_rows
    ! mpi_comm_cols
    !             MPI-Communicators for rows/columns
    ! mpi_comm
    !             MPI-Communicator for the total processor set
    !-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#else
    use timings_dummy
#endif
    use elpa2_workload
    use precision
    implicit none

    !#ifdef WITH_GPU_VERSION
    !  integer(C_SIZE_T)                    :: h_dev, hv_new_dev, ab_dev, x_dev, hs_dev, tau_new_dev, hv_dev, hd_dev
    !  complex*16, allocatable              :: ab_temp(:,:)
    !#endif

    integer(kind=ik), intent(in)            :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
#ifdef USE_ASSUMED_SIZE
    complex(kind=COMPLEX_DATATYPE), intent(in) :: a(lda,*)
#else
    complex(kind=COMPLEX_DATATYPE), intent(in) :: a(lda,matrixCols)
#endif
    real(kind=REAL_DATATYPE), intent(out)   :: d(na), e(na) ! set only on PE 0
    complex(kind=COMPLEX_DATATYPE), intent(inout), &
      allocatable                           :: hh_trans_complex(:,:)

    real(kind=REAL_DATATYPE)                :: vnorm2
    complex(kind=COMPLEX_DATATYPE)          :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
    complex(kind=COMPLEX_DATATYPE)          :: hd(nb), hs(nb)

    integer(kind=ik)                        :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
    integer(kind=ik)                        :: my_pe, n_pes, mpierr
    integer(kind=ik)                        :: my_prow, np_rows, my_pcol, np_cols
    integer(kind=ik)                        :: ireq_ab, ireq_hv
    integer(kind=ik)                        :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
#ifdef WITH_OPENMP
    ! integer(kind=ik), allocatable         :: mpi_statuses(:,:)
    integer(kind=ik), allocatable           :: omp_block_limits(:)
    integer(kind=ik)                        :: max_threads, my_thread, my_block_s, my_block_e, iter
    integer(kind=ik)                        :: omp_get_max_threads
#ifdef WITH_MPI
    ! integer(kind=ik)                      :: my_mpi_status(MPI_STATUS_SIZE)
#endif
    complex(kind=COMPLEX_DATATYPE), allocatable :: hv_t(:,:), tau_t(:)
#endif
    integer(kind=ik), allocatable           :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
    integer(kind=ik), allocatable           :: limits(:), snd_limits(:,:)
    integer(kind=ik), allocatable           :: block_limits(:)
    complex(kind=COMPLEX_DATATYPE), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
    integer(kind=ik)                        :: istat
    character(200)                          :: errorMessage
#ifndef WITH_MPI
    integer(kind=ik)                        :: startAddr
#endif

    !
    ! dummies for calling redist_band
    !
    real*8                                  :: r_a(1,1), r_ab(1,1)

    call timer%start("tridiag_band_complex_PRECISION")
    call timer%start("mpi_communication")
    call mpi_comm_rank(mpi_comm,my_pe,mpierr)
    call mpi_comm_size(mpi_comm,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
    call timer%stop("mpi_communication")

    !#ifdef WITH_GPU_VERSION
    !  t_1 = 0
    !  t_2 = 0
    !#endif
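    ! Reference note (inferred from the indexing used below, e.g. d(istep) = ab(1,...) and
    ! e(istep) = ab(2,...)): ab stores the lower part of the band column-wise, i.e. for a
    ! global column j held by this PE
    !
    !   ab(1,j-n_off) = A(j,j)        ! diagonal
    !   ab(i,j-n_off) = A(j+i-1,j)    ! i = 2, ..., nb+1: sub-diagonals
    !
    ! The leading dimension 2*nb lets the BLAS calls below use lda = 2*nb-1, which makes the
    ! banded storage appear as a contiguous (sub)matrix, and the spare block column at the
    ! end avoids shifting ab in every single step.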
    ! Get global_id mapping 2D processor coordinates to global id

    allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating global_id "//errorMessage
      stop
    endif

    global_id(:,:) = 0
    global_id(my_prow, my_pcol) = my_pe
#ifdef WITH_MPI
    call timer%start("mpi_communication")
    call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr)
    call timer%stop("mpi_communication")
#endif

    ! Total number of blocks in the band:
    nblocks_total = (na-1)/nb + 1

    ! Set work distribution
    allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating block_limits "//errorMessage
      stop
    endif

    call divide_band(nblocks_total, n_pes, block_limits)

    ! nblocks: the number of blocks for my task
    nblocks = block_limits(my_pe+1) - block_limits(my_pe)

    ! allocate the part of the band matrix which is needed by this PE
    ! The size is 1 block larger than needed to avoid extensive shifts
    allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating ab "//errorMessage
      stop
    endif

    !#ifdef WITH_GPU_VERSION
    !  allocate(ab_temp(2*nb,nblocks*nb), stat=istat, errmsg=errorMessage)
    !  if (istat .ne. 0) then
    !    print *,"error when allocating ab_temp "//errorMessage
    !    stop
    !  endif
    !#endif

    ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety

    !#ifdef WITH_GPU_VERSION
    !
    !  istat = cuda_malloc(ab_dev, 2*nb*(nblocks+1)*nb*size_of_complex_datatype)
    !  if (istat .ne. 0) print *, " cuda malloc failed ab_dev", istat
    !
    !  istat = cuda_malloc(hv_new_dev, nb*size_of_complex_datatype)
    !  if (istat .ne. 0) print *, " cuda malloc failed hv_new_dev", istat
    !
    !!  istat = cuda_malloc(temp_c_dev, nb*nb*size_of_complex_datatype)
    !!  if (istat .ne. 0) print *, " cuda malloc failed temp_c", istat
    !
    !  istat = cuda_malloc(h_dev, nb*size_of_complex_datatype)
    !  if (istat .ne. 0) print *, " cuda malloc failed h_dev", istat
    !
    !  istat = cuda_malloc(hs_dev, nb*size_of_complex_datatype)
    !  if (istat .ne. 0) print *, " cuda malloc failed hs_dev", istat
    !
    !  istat = cuda_malloc(x_dev, 1*size_of_complex_datatype)
    !  if (istat .ne. 0) print *, " cuda malloc failed x_dev", istat
    !
    !  istat = cuda_malloc(tau_new_dev, 1*size_of_complex_datatype)
    !  if (istat .ne. 0) print *, " cuda malloc failed tau_new_dev", istat
    !
    !  istat = cuda_malloc(hv_dev, nb*size_of_complex_datatype)
    !  if (istat .ne. 0) print *, " cuda malloc failed hv_dev", istat
    !
    !  istat = cuda_malloc(hd_dev, nb*size_of_complex_datatype)
    !  if (istat .ne. 0) print *, " cuda malloc failed hd_dev", istat
    !#endif

    ! n_off: Offset of ab within band
    n_off = block_limits(my_pe)*nb

    ! Redistribute band in a to ab
    call redist_band_complex_PRECISION(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab)

    ! Calculate the workload for each sweep in the back transformation
    ! and the space requirements to hold the HH vectors

    allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating limits "//errorMessage
      stop
    endif

    call determine_workload(na, nb, np_rows, limits)
    max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
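    ! The counting loop below mirrors the receive pattern of the back transformation: the HH
    ! vectors belonging to sweep block n are stored on process column mod(n-1,np_cols) and
    ! are split over the process rows according to determine_workload. num_hh_vecs counts
    ! the vectors that end up on this PE and num_chunks the corresponding receive requests,
    ! so that hh_trans_complex and ireq_hhr can be allocated with exactly the right sizes.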
    num_hh_vecs = 0
    num_chunks  = 0
    nx = na
    do n = 1, nblocks_total
      call determine_workload(nx, nb, np_rows, limits)
      local_size = limits(my_prow+1) - limits(my_prow)
      ! add to number of householder vectors
      ! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor sent below!
      if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
        num_hh_vecs = num_hh_vecs + local_size
        num_chunks  = num_chunks+1
      endif
      nx = nx - nb
    enddo

    ! Allocate space for HH vectors

    allocate(hh_trans_complex(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating hh_trans_complex "//errorMessage
      stop
    endif

    ! Allocate and init MPI requests

    allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating ireq_hhr "//errorMessage
      stop
    endif

    allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage) ! Send requests
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating ireq_hhs "//errorMessage
      stop
    endif

    num_hh_vecs = 0
    num_chunks  = 0
    nx = na
    nt = 0
    do n = 1, nblocks_total
      call determine_workload(nx, nb, np_rows, limits)
      local_size = limits(my_prow+1) - limits(my_prow)
      if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
        num_chunks = num_chunks+1
#ifdef WITH_MPI
        call timer%start("mpi_communication")
        call mpi_irecv(hh_trans_complex(1,num_hh_vecs+1), nb*local_size, MPI_COMPLEX_EXPLICIT_PRECISION, nt, &
                       10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr)
        call timer%stop("mpi_communication")
#else /* WITH_MPI */
        ! careful: this is a non-blocking recv; the data copy must be done at the corresponding wait or send
        ! hh_trans_complex(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk)
#endif /* WITH_MPI */
        num_hh_vecs = num_hh_vecs + local_size
      endif
      nx = nx - nb
      if (n == block_limits(nt+1)) then
        nt = nt + 1
      endif
    enddo
#ifdef WITH_MPI
    ireq_hhs(:) = MPI_REQUEST_NULL
#endif
    ! Buffers for gathering/sending the HH vectors

    allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating hh_gath "//errorMessage
      stop
    endif

    allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating hh_send "//errorMessage
      stop
    endif
    hh_gath(:,:,:) = CONST_COMPLEX_0_0
    hh_send(:,:,:) = CONST_COMPLEX_0_0

    ! Some counters

    allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating hh_cnt "//errorMessage
      stop
    endif

    allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating hh_dst "//errorMessage
      stop
    endif

    hh_cnt(:) = 1 ! The first transformation vector is always 0 and not calculated at all
    hh_dst(:) = 0 ! PE number for receive
#ifdef WITH_MPI
    ireq_ab = MPI_REQUEST_NULL
    ireq_hv = MPI_REQUEST_NULL
#endif
    ! Limits for sending

    allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating snd_limits "//errorMessage
      stop
    endif

    do iblk=1,nblocks
      call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
    enddo
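    ! In short, the transport scheme set up above works as follows: during the reduction each
    ! PE gathers the HH vectors of its blocks in hh_gath (hh_cnt counts them per block; it
    ! starts at 1 because the first vector is always 0). As soon as a block has collected as
    ! many vectors as the destination process row needs (the difference of two consecutive
    ! snd_limits entries), the chunk is copied to hh_send and sent with tag 10+iblk to
    ! global_id(hh_dst(iblk), block's process column); hh_dst(iblk) then advances to the next
    ! row. The matching non-blocking receives into hh_trans_complex were posted above.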
#ifdef WITH_OPENMP
    ! OpenMP work distribution:

    max_threads = 1
    !$ max_threads = omp_get_max_threads()

    ! For OpenMP we need at least 2 blocks for every thread
    max_threads = MIN(max_threads, nblocks/2)
    if (max_threads==0) max_threads = 1

    allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating omp_block_limits "//errorMessage
      stop
    endif

    ! Get the OpenMP block limits
    call divide_band(nblocks, max_threads, omp_block_limits)

    allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when allocating hv_t, tau_t "//errorMessage
      stop
    endif

    hv_t  = CONST_COMPLEX_0_0
    tau_t = CONST_COMPLEX_0_0
#endif

    ! ---------------------------------------------------------------------------
    ! Start of calculations

    na_s = block_limits(my_pe)*nb + 1

    if (my_pe>0 .and. na_s<=na) then
      ! send first column to previous PE
      ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
      ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
      call timer%start("mpi_communication")
      call mpi_isend(ab_s, nb+1, MPI_COMPLEX_EXPLICIT_PRECISION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
      call timer%stop("mpi_communication")
#endif /* WITH_MPI */
    endif

#ifndef WITH_MPI
    startAddr = ubound(hh_trans_complex,dim=2)
#endif

#ifdef WITH_OPENMP
    do istep=1,na-1-block_limits(my_pe)*nb
#else
    do istep=1,na-1
#endif
      if (my_pe==0) then
        n = MIN(na-na_s,nb) ! number of rows to be reduced
        hv(:) = CONST_COMPLEX_0_0
        tau = CONST_COMPLEX_0_0

        ! Transform first column of remaining matrix
        ! As opposed to the real case, the last step (istep=na-1) is needed here for making
        ! the last subdiagonal element a real number
#ifdef DOUBLE_PRECISION_COMPLEX
        vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk8)**2+dimag(ab(3:n+1,na_s-n_off))**2)
#else
        vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk4)**2+aimag(ab(3:n+1,na_s-n_off))**2)
#endif
        if (n<2) vnorm2 = 0. ! Safety only

        call hh_transform_complex_PRECISION(ab(2,na_s-n_off),vnorm2,hf,tau)

        hv(1) = CONST_COMPLEX_1_0
        hv(2:n) = ab(3:n+1,na_s-n_off)*hf

        d(istep) = ab(1,na_s-n_off)
        e(istep) = ab(2,na_s-n_off)
        if (istep == na-1) then
          d(na) = ab(1,na_s+1-n_off)
          e(na) = CONST_REAL_0_0
        endif
      else
        if (na>na_s) then
          ! Receive Householder vector from previous task, from PE owning subdiagonal
#ifdef WITH_OPENMP
#ifdef WITH_MPI
          call timer%start("mpi_communication")
          call mpi_recv(hv, nb, MPI_COMPLEX_EXPLICIT_PRECISION, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr)
          call timer%stop("mpi_communication")
#else /* WITH_MPI */
          hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
#ifdef WITH_MPI
          call timer%start("mpi_communication")
          call mpi_recv(hv, nb, MPI_COMPLEX_EXPLICIT_PRECISION, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr)
          call timer%stop("mpi_communication")
#else /* WITH_MPI */
          hv(1:nb) = hv_s(1:nb)
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
          tau = hv(1)
          hv(1) = CONST_COMPLEX_1_0
        endif
      endif

      na_s = na_s+1
      if (na_s-n_off > nb) then
        !#ifdef WITH_GPU_VERSION
        !  ab_temp(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
        !  ab(:, 1:nblocks*nb) = ab_temp(:, 1:nblocks*nb)
        !#else
        ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
        !#endif
        ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0
        n_off = n_off + nb
      endif

#ifdef WITH_OPENMP
      if (max_threads > 1) then

        ! Codepath for OpenMP

        ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
        ! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
        ! This simulates the behaviour of the MPI tasks which also work after each other.
        ! The code would be considerably easier, if the MPI communication would be made within
        ! the parallel region - this is avoided here since this would require
        ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.
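        ! Illustration of the resulting pipeline (set up via divide_band above): thread t owns
        ! the contiguous block range omp_block_limits(t-1)+1 .. omp_block_limits(t) and works
        ! t-1 Householder steps behind thread 1 (this is the "- my_thread + 1" shift in ns
        ! below). For example, with 4 threads and 8 blocks, thread 1 handles blocks 1-2,
        ! thread 2 blocks 3-4, and so on, each one step later than its predecessor. iter=1
        ! below processes only the first block of each range, after which the first/last
        ! columns are exchanged with the neighbouring PEs; iter=2 processes the remaining blocks.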
        hv_t(:,1) = hv
        tau_t(1) = tau

        do iter = 1, 2

          ! iter=1 : work on first block
          ! iter=2 : work on remaining blocks
          ! This is done in 2 iterations so that we have a barrier in between:
          ! After the first iteration, it is guaranteed that the last row of the last block
          ! is completed by the next thread.
          ! After the first iteration it is also the place to exchange the last row
          ! with MPI calls
          call timer%start("OpenMP parallel_double")

!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp&                    nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
          do my_thread = 1, max_threads

            if (iter == 1) then
              my_block_s = omp_block_limits(my_thread-1) + 1
              my_block_e = my_block_s
            else
              my_block_s = omp_block_limits(my_thread-1) + 2
              my_block_e = omp_block_limits(my_thread)
            endif

            do iblk = my_block_s, my_block_e

              ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
              ne = ns+nb-1                                    ! last column in block

              if (istep<my_thread .or. ns+n_off>na) exit

              hv = hv_t(:,my_thread)
              tau = tau_t(my_thread)

              ! Store Householder vector for back transformation

              hh_cnt(iblk) = hh_cnt(iblk) + 1

              hh_gath(1   ,hh_cnt(iblk),iblk) = tau
              hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

              nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
              nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                            ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

              ! Transform diagonal block

              call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_COMPLEX_PAIR_0_0, hd, 1)

              x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
              hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc)

              call PRECISION_HER2('L', nc, CONST_COMPLEX_PAIR_NEGATIVE_1_0, hd, 1, hv, 1, ab(1,ns), 2*nb-1)

              hv_t(:,my_thread) = CONST_COMPLEX_0_0
              tau_t(my_thread)  = CONST_COMPLEX_0_0

              if (nr<=0) cycle ! No subdiagonal block present any more

              ! Transform subdiagonal block

              call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, CONST_COMPLEX_PAIR_0_0, hs, 1)

              if (nr>1) then

                ! complete (old) Householder transformation for first column

                ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1

                ! calculate new Householder transformation for first column
                ! (stored in hv_t(:,my_thread) and tau_t(my_thread))

#ifdef DOUBLE_PRECISION_COMPLEX
                vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
#else
                vnorm2 = sum(real(ab(nb+2:nb+nr,ns))**2+aimag(ab(nb+2:nb+nr,ns))**2)
#endif
                call hh_transform_complex_PRECISION(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread))

                hv_t(1   ,my_thread) = CONST_COMPLEX_1_0
                hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
                ab(nb+2:,ns) = CONST_COMPLEX_0_0

                ! update subdiagonal block for old and new Householder transformation
                ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster

                call PRECISION_GEMV('C', nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), &
                                    1, CONST_COMPLEX_PAIR_0_0, h(2), 1)

                x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
                h(2:nb) = h(2:nb) - x*hv(2:nb)

                ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
                do i=2,nb
                  ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) &
                                                - hv_t(1:nr,my_thread)*conjg(h(i)) - hs(1:nr)*conjg(hv(i))
                enddo

              else

                ! No new Householder transformation for nr=1, just complete the old one

                ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1

                do i=2,nb
                  ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
                enddo

                ! For safety: there is one remaining dummy transformation (but tau is 0 anyways)
                hv_t(1,my_thread) = CONST_COMPLEX_1_0

              endif

            enddo
          enddo ! my_thread
!$omp end parallel do
          call timer%stop("OpenMP parallel_double")

          if (iter==1) then

            ! We are at the end of the first block

            ! Send our first column to previous PE
            if (my_pe>0 .and. na_s <= na) then
#ifdef WITH_MPI
              call timer%start("mpi_communication")
              call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
              call timer%stop("mpi_communication")
#endif
              ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
#ifdef WITH_MPI
              call timer%start("mpi_communication")
              call mpi_isend(ab_s, nb+1, MPI_COMPLEX_EXPLICIT_PRECISION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
              call timer%stop("mpi_communication")
#endif /* WITH_MPI */
            endif

            ! Request last column from next PE
            ne = na_s + nblocks*nb - (max_threads-1) - 1
            if (istep>=max_threads .and. ne <= na) then
#ifdef WITH_MPI
              call timer%start("mpi_communication")
              call mpi_recv(ab(1,ne-n_off), nb+1, MPI_COMPLEX_EXPLICIT_PRECISION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr)
              call timer%stop("mpi_communication")
#else /* WITH_MPI */
              ab(1:nb+1,ne-n_off) = ab_s(1:nb+1)
#endif /* WITH_MPI */
            endif

          else

            ! We are at the end of all blocks

            ! Send last HH vector and TAU to next PE if it has been calculated above
            ne = na_s + nblocks*nb - (max_threads-1) - 1
            if (istep>=max_threads .and. ne < na) then
#ifdef WITH_MPI
              call timer%start("mpi_communication")
              call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)
              call timer%stop("mpi_communication")
#endif
              hv_s(1) = tau_t(max_threads)
              hv_s(2:) = hv_t(2:,max_threads)
#ifdef WITH_MPI
              call timer%start("mpi_communication")
              call mpi_isend(hv_s, nb, MPI_COMPLEX_EXPLICIT_PRECISION, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
              call timer%stop("mpi_communication")
#endif /* WITH_MPI */
            endif

            ! "Send" HH vector and TAU to next OpenMP thread
            do my_thread = max_threads, 2, -1
              hv_t(:,my_thread) = hv_t(:,my_thread-1)
              tau_t(my_thread)  = tau_t(my_thread-1)
            enddo

          endif
        enddo ! iter

      else

        ! Codepath for 1 thread without OpenMP

        ! The following code is structured in a way to keep waiting times for
        ! other PEs at a minimum, especially if there is only one block.
        ! For this reason, it requests the last column as late as possible
        ! and sends the Householder vector and the first column as early
        ! as possible.
#endif /* WITH_OPENMP */

        !#ifdef WITH_GPU_VERSION
        !  call cpu_time(start)
        !#endif
        do iblk=1,nblocks

          ns = na_s + (iblk-1)*nb - n_off ! first column in block
          ne = ns+nb-1                    ! last column in block

          if (ns+n_off>na) exit

          ! Store Householder vector for back transformation

          hh_cnt(iblk) = hh_cnt(iblk) + 1

          hh_gath(1   ,hh_cnt(iblk),iblk) = tau
          hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)

#ifndef WITH_OPENMP
          if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
            ! Wait for last transfer to finish
#ifdef WITH_MPI
            call timer%start("mpi_communication")
            call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
            call timer%stop("mpi_communication")
#endif
            ! Copy vectors into send buffer
            hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
            ! Send to destination
#ifdef WITH_MPI
            call timer%start("mpi_communication")
            call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX_EXPLICIT_PRECISION, &
                           global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), &
                           10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
            call timer%stop("mpi_communication")
#else /* WITH_MPI */
            startAddr = startAddr - hh_cnt(iblk)
            hh_trans_complex(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
#endif /* WITH_MPI */
            ! Reset counter and increase destination row
            hh_cnt(iblk) = 0
            hh_dst(iblk) = hh_dst(iblk)+1
          endif
          ! The following code is structured in a way to keep waiting times for
          ! other PEs at a minimum, especially if there is only one block.
          ! For this reason, it requests the last column as late as possible
          ! and sends the Householder vector and the first column as early
          ! as possible.
#endif /* WITH_OPENMP */

          nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
          nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
                                        ! Note that nr>=0 implies that diagonal block is full (nc==nb)!

          ! Multiply diagonal block and subdiagonal block with Householder vector

          if (iblk==nblocks .and. nc==nb) then

            ! We need the last column from the next PE.
            ! First do the matrix multiplications without last column ...

            ! Diagonal block, the contribution of the last element is added below
            ab(1,ne) = CONST_COMPLEX_0_0
            call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_COMPLEX_PAIR_0_0, hd, 1)

            ! Subdiagonal block
            if (nr>0) call PRECISION_GEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1, CONST_COMPLEX_PAIR_0_0, hs, 1)

            ! ... then request last column ...
#ifdef WITH_MPI
            call timer%start("mpi_communication")
#ifdef WITH_OPENMP
            call mpi_recv(ab(1,ne), nb+1, MPI_COMPLEX_EXPLICIT_PRECISION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#else
            call mpi_recv(ab(1,ne), nb+1, MPI_COMPLEX_EXPLICIT_PRECISION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#endif
            call timer%stop("mpi_communication")
#else /* WITH_MPI */
            ab(1:nb+1,ne) = ab_s(1:nb+1)
#endif /* WITH_MPI */

            ! ... and complete the result
            hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
            hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau

          else

            ! Normal matrix multiply
            call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_COMPLEX_PAIR_0_0, hd, 1)
            if (nr>0) call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, CONST_COMPLEX_PAIR_0_0, hs, 1)

          endif

          ! Calculate first column of subdiagonal block and calculate new
          ! Householder transformation for this column

          hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb
          !#ifdef WITH_GPU_VERSION
          !  istat = cuda_memset(hv_new_dev, 0, nb*size_of_complex_datatype)
          !  if (istat .ne. 0) print *, " cuda memset failed hv_new_dev", istat
          !#endif
          tau_new = 0

          if (nr>0) then

            ! complete (old) Householder transformation for first column

            ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1

            ! calculate new Householder transformation ...
            if (nr>1) then
#ifdef DOUBLE_PRECISION_COMPLEX
              vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk8)**2+dimag(ab(nb+2:nb+nr,ns))**2)
#else
              vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk4)**2+aimag(ab(nb+2:nb+nr,ns))**2)
#endif
              call hh_transform_complex_PRECISION(ab(nb+1,ns),vnorm2,hf,tau_new)

              hv_new(1) = CONST_COMPLEX_1_0
              hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf
              ab(nb+2:,ns) = CONST_COMPLEX_0_0
            endif

            ! ... and send it away immediately if this is the last block

            if (iblk==nblocks) then
#ifdef WITH_MPI
              call timer%start("mpi_communication")
#ifdef WITH_OPENMP
              call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)
#else
              call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)
#endif
              call timer%stop("mpi_communication")
#endif /* WITH_MPI */
              hv_s(1) = tau_new
              hv_s(2:) = hv_new(2:)
#ifdef WITH_MPI
              call timer%start("mpi_communication")
              call mpi_isend(hv_s, nb, MPI_COMPLEX_EXPLICIT_PRECISION, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
              call timer%stop("mpi_communication")
#endif /* WITH_MPI */
            endif

          endif

          ! Transform diagonal block

          x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
          hd(1:nc) = hd(1:nc) - CONST_REAL_0_5*x*hv(1:nc)
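          ! Note (standard two-sided Householder update, spelled out for clarity): with v = hv,
          ! hd = tau*A*v from the HEMV above and x = (v^H * hd)*conjg(tau) (real, since A is
          ! Hermitian), the corrected vector w = hd - 0.5*x*v turns the two-sided similarity
          ! transformation with the reflector built from (v, tau) into a single Hermitian
          ! rank-2 update  A <- A - w*v^H - v*w^H,  which is what the HER2 calls below apply
          ! to the diagonal block.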
          !#ifdef WITH_GPU_VERSION
          !  istat = cuda_memcpy2d((ab_dev + (ns-1)*2*nb*size_of_complex_datatype), 2*nb*size_of_complex_datatype, &
          !                        loc(a(1,ns)), 2*nb*size_of_complex_datatype, 2*size_of_complex_datatype, &
          !                        2*nb*size_of_complex_datatype, cudaMemcpyHostToDevice)
          !  if (istat .ne. 0) print *, "cuda memcpy a_dev H2D failed ", istat
          !  istat = cuda_memcpy(hv_dev, loc(hv), nc*size_of_complex_datatype, cudaMemcpyHostToDevice)
          !  if (istat .ne. 0) print *,"cuda memcpy failed hv_dev", istat
          !  istat = cuda_memcpy(hd_dev, loc(hd), nb*size_of_complex_datatype, cudaMemcpyHostToDevice)
          !  if (istat .ne. 0) print *,"cuda memcpy failed hd_dev", istat
          !#endif

          if (my_pe>0 .and. iblk==1) then

            ! The first column of the diagonal block has to be sent to the previous PE
            ! Calculate first column only ...
            !#ifdef WITH_GPU_VERSION
            !  call double_hh_transform_2( ns, nc, nb )
            !  istat = cuda_memcpy(loc(ab), ab_dev, (2*nb*(nblocks+1)*nb)*size_of_complex_datatype, cudaMemcpyDeviceToHost)
            !  if (istat .ne. 0) print *, " cuda memcpy failed ab ", istat
            !#else
            ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*conjg(hv(1)) - hv(1:nc)*conjg(hd(1))
            !#endif

            ! ... send it away ...
#ifdef WITH_MPI
            call timer%start("mpi_communication")
#ifdef WITH_OPENMP
            call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
#else
            call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
#endif
            call timer%stop("mpi_communication")
#endif /* WITH_MPI */
            ab_s(1:nb+1) = ab(1:nb+1,ns)
#ifdef WITH_MPI
            call timer%start("mpi_communication")
            call mpi_isend(ab_s, nb+1, MPI_COMPLEX_EXPLICIT_PRECISION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
            call timer%stop("mpi_communication")
#endif /* WITH_MPI */

            ! ... and calculate remaining columns with rank-2 update
            if (nc>1) then
              !#ifdef WITH_GPU_VERSION
              !  call cublas_PRECISION_HER2( 'L', nc-1, (-1.d0,0.d0), hd_dev + 1*16, 1, hv_dev + 1*16, 1, ab_dev + (ns*2*nb)*16, 2*nb-1)
              !#else
              call PRECISION_HER2('L', nc-1, CONST_COMPLEX_PAIR_NEGATIVE_1_0, hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1)
              !#endif
            endif

          else

            ! No need to send, just a rank-2 update
            !#ifdef WITH_GPU_VERSION
            !  call cublas_PRECISION_HER2( 'L', nc, (-1.d0,0.d0), hd_dev, 1, hv_dev, 1, ab_dev + ((ns-1)*2*nb)*16, 2*nb-1)
            !#else
            call PRECISION_HER2('L', nc, CONST_COMPLEX_PAIR_NEGATIVE_1_0, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
            !#endif

          endif

          !#ifdef WITH_GPU_VERSION
          !  istat = cuda_memcpy(loc(hd), hd_dev, nb*size_of_complex_datatype, cudaMemcpyDeviceToHost)
          !  if (istat .ne. 0) print *,"cuda memcpy failed hd_dev", istat
          !#endif

          ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb

          !#ifdef WITH_GPU_VERSION
          !  istat = cuda_memcpy(hs_dev, loc(hs), nb*size_of_complex_datatype, cudaMemcpyHostToDevice)
          !  if (istat .ne. 0) print *,"cuda memcpy failed hs_dev", istat
          !#endif
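          ! What the nr>0 branch below does (same scheme as in the OpenMP codepath above): the
          ! subdiagonal block still has to absorb the current transformation from the right
          ! (via hs = tau*ab_sub*hv) and, for nr>1, the newly computed transformation
          ! (hv_new, tau_new) from the left. Folding both into one pass yields the
          ! nonsymmetric rank-2 update  ab_sub <- ab_sub - hv_new*h^H - hs*hv^H,  which has no
          ! BLAS counterpart and is therefore written as an explicit loop over the columns.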
          if (nr>0) then
            if (nr>1) then
              !#ifdef WITH_GPU_VERSION
              !  istat = cuda_memcpy(hv_new_dev, loc(hv_new), nb*size_of_complex_datatype, cudaMemcpyHostToDevice)
              !  if (istat .ne. 0) print *,"cuda memcpy failed hv_new_dev", istat
              !
              !  istat = cuda_memcpy(h_dev, loc(h), nb*size_of_complex_datatype, cudaMemcpyHostToDevice)
              !  if (istat .ne. 0) print *,"cuda memcpy failed h_dev", istat
              !
              !  call cublas_PRECISION_GEMV('C', nr, nb-1, tau_new, ab_dev + (nb-1 + ns*2*nb)*16, 2*nb-1, hv_new_dev, 1, (0.d0,0.d0), h_dev + 1*16, 1)
              !
              !  istat = cuda_memcpy(tau_new_dev, loc(tau_new), 1*size_of_complex_datatype, cudaMemcpyHostToDevice)
              !  if (istat .ne. 0) print *,"cuda memcpy failed tau_new_dev", istat
              !
              !  call dot_product_kernel(nr, tau_new)
              !  call dot_product_kernel_1(nb, nr, ns)
              !
              !  istat = cuda_memcpy(loc(x), x_dev, 1*size_of_complex_datatype, cudaMemcpyDeviceToHost)
              !  if (istat .ne. 0) print *, " cuda memcpy failed x_dev ", istat
              !
              !  istat = cuda_memcpy(loc(h), h_dev, nb*size_of_complex_datatype, cudaMemcpyDeviceToHost)
              !  if (istat .ne. 0) print *, " cuda memcpy failed h ", istat
              !#else
              call PRECISION_GEMV('C', nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, hv_new, 1, CONST_COMPLEX_PAIR_0_0, h(2), 1)

              x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new
              h(2:nb) = h(2:nb) - x*hv(2:nb)

              ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update
              do i=2,nb
                ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i))
              enddo
              !#endif
            else
              ! No double Householder transformation for nr=1, just complete the row
              !#ifdef WITH_GPU_VERSION
              !  call double_hh_transform_1(nb, ns)
              !#else
              do i=2,nb
                ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
              enddo
              !#endif
            endif
          endif

          ! Use new HH vector for the next block
          hv(:) = hv_new(:)
          tau = tau_new

        enddo
        !#ifdef WITH_GPU_VERSION
        !  call cpu_time(finish)
        !  tstep2 = finish-start
        !  t_2 = t_2 + tstep2
        !#endif

#ifdef WITH_OPENMP
      endif
#endif

#ifdef WITH_OPENMP
      do iblk = 1, nblocks

        if (hh_dst(iblk) >= np_rows) exit
        if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit

        if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
          ! Wait for last transfer to finish
#ifdef WITH_MPI
          call timer%start("mpi_communication")
          call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
          call timer%stop("mpi_communication")
#endif
          ! Copy vectors into send buffer
          hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
          ! Send to destination
#ifdef WITH_MPI
          call timer%start("mpi_communication")
          call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX_EXPLICIT_PRECISION, &
                         global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), &
                         10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
          call timer%stop("mpi_communication")
#else /* WITH_MPI */
          startAddr = startAddr - hh_cnt(iblk)
          hh_trans_complex(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
#endif /* WITH_MPI */
          ! Reset counter and increase destination row
          hh_cnt(iblk) = 0
          hh_dst(iblk) = hh_dst(iblk)+1
        endif
      enddo
#endif /* WITH_OPENMP */
    enddo

    !#ifdef WITH_GPU_VERSION
    !  call cpu_time(finish_1)
    !  tstep1 = finish_1-start_1
    !  t_1 = t_1 + tstep1
    !#endif

    ! Finish the last outstanding requests
#ifdef WITH_OPENMP
#ifdef WITH_MPI
    call timer%start("mpi_communication")
    call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
    call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)

    ! allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks)), stat=istat, errmsg=errorMessage)
    ! if (istat .ne. 0) then
    !   print *,"tridiag_band_complex: error when allocating mpi_statuses "//errorMessage
    !   stop
    ! endif

    call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
    call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)

    ! deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
    ! if (istat .ne. 0) then
    !   print *,"tridiag_band_complex: error when deallocating mpi_statuses "//errorMessage
    !   stop
    ! endif
    call timer%stop("mpi_communication")
#endif /* WITH_MPI */

#else /* WITH_OPENMP */

#ifdef WITH_MPI
    call timer%start("mpi_communication")
    call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr)
    call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr)

    call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
    call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)
    call timer%stop("mpi_communication")
#endif

#endif /* WITH_OPENMP */

#ifdef WITH_MPI
    call timer%start("mpi_communication")
    call mpi_barrier(mpi_comm,mpierr)
    call timer%stop("mpi_communication")
#endif
    deallocate(ab, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when deallocating ab "//errorMessage
      stop
    endif

    !#ifdef WITH_GPU_VERSION
    !  deallocate(ab_temp, stat=istat, errmsg=errorMessage)
    !  if (istat .ne. 0) then
    !    print *,"error when deallocating ab_temp "//errorMessage
    !    stop
    !  endif
    !#endif

    deallocate(ireq_hhr, ireq_hhs, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when deallocating ireq_hhr, ireq_hhs "//errorMessage
      stop
    endif

    deallocate(hh_cnt, hh_dst, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when deallocating hh_cnt, hh_dst "//errorMessage
      stop
    endif

    deallocate(hh_gath, hh_send, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when deallocating hh_gath, hh_send "//errorMessage
      stop
    endif

    deallocate(limits, snd_limits, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when deallocating limits, snd_limits "//errorMessage
      stop
    endif

    deallocate(block_limits, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when deallocating block_limits "//errorMessage
      stop
    endif

    deallocate(global_id, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"tridiag_band_complex: error when deallocating global_id "//errorMessage
      stop
    endif

    !#ifdef WITH_GPU_VERSION
    !  istat = cuda_free(ab_dev)
    !  if (istat .ne. 0) then
    !    print *,"error in cudaFree"
    !    stop
    !  endif
    !
    !  istat = cuda_free(hv_new_dev)
    !  if (istat .ne. 0) then
    !    print *,"error in cudaFree"
    !    stop
    !  endif
    !
    !  istat = cuda_free(hs_dev)
    !  if (istat .ne. 0) then
    !    print *,"error in cudaFree"
    !    stop
    !  endif
    !
    !  istat = cuda_free(h_dev)
    !  if (istat .ne. 0) then
    !    print *,"error in cudaFree"
    !    stop
    !  endif
    !
    !  istat = cuda_free(tau_new_dev)
    !  if (istat .ne. 0) then
    !    print *,"error in cudaFree"
    !    stop
    !  endif
    !
    !  istat = cuda_free(x_dev)
    !  if (istat .ne. 0) then
    !    print *,"error in cudaFree"
    !    stop
    !  endif
    !#endif

    call timer%stop("tridiag_band_complex_PRECISION")

    !#ifdef WITH_GPU_VERSION
    !  contains
    !
    !    subroutine dot_product_kernel(nr, tau_new)
    !      implicit none
    !      integer, intent(in)    :: nr
    !      complex*16, intent(in) :: tau_new
    !
    !      call launch_dot_product_kernel( hs_dev, hv_new_dev, tau_new, x_dev, h_dev, hv_dev, nr )
    !    end subroutine
    !
    !    subroutine dot_product_kernel_1( nb, nr, ns)
    !      implicit none
    !      integer, intent(in)    :: nb, nr, ns
    !
    !      call launch_dot_product_kernel_1(ab_dev, hs_dev, hv_new_dev, x_dev, h_dev, hv_dev, nb, nr, ns)
    !    end subroutine
    !
    !    subroutine double_hh_transform_1( nb, ns)
    !      implicit none
    !      integer, intent(in)    :: nb, ns
    !
    !      call launch_double_hh_transform_1(ab_dev, hs_dev, hv_dev, nb, ns)
    !    end subroutine
    !
    !    subroutine double_hh_transform_2( ns, nc, nb)
    !      implicit none
    !      integer, intent(in)    :: nc, ns, nb
    !
    !      call launch_double_hh_transform_2(ab_dev, hd_dev, hv_dev, nc, ns, nb)
    !    end subroutine
    !#endif

  end subroutine tridiag_band_complex_PRECISION ! has to be checked for GPU