! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), formerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen , ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! This particular source code file contains additions, changes and ! enhancements authored by Intel Corporation which is not part of ! the ELPA consortium. ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". #include "../general/sanity.X90" subroutine trans_ev_tridi_to_band_& &MATH_DATATYPE& &_& &PRECISION & (obj, na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, & hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, & kernel) !------------------------------------------------------------------------------- ! trans_ev_tridi_to_band_real/complex: ! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix ! ! Parameters ! ! na Order of matrix a, number of rows of matrix q ! ! nev Number eigenvectors to compute (= columns of matrix q) ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! nb semi bandwith ! ! q On input: Eigenvectors of tridiagonal matrix ! On output: Transformed eigenvectors ! Distribution is like in Scalapack. ! ! q_dev GPU device pointer to q ! ! ldq Leading dimension of q ! matrixCols local columns of matrix q ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns/both ! !------------------------------------------------------------------------------- use elpa_abstract_impl use elpa2_workload use pack_unpack_cpu use pack_unpack_gpu use compute_hh_trafo use cuda_functions use precision use iso_c_binding implicit none class(elpa_abstract_impl_t), intent(inout) :: obj logical, intent(in) :: useGPU integer(kind=ik), intent(in) :: kernel integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols #if REALCASE == 1 #ifdef USE_ASSUMED_SIZE real(kind=REAL_DATATYPE) :: q(ldq,*) #else real(kind=REAL_DATATYPE) :: q(ldq,matrixCols) #endif real(kind=REAL_DATATYPE), intent(in) :: hh_trans(:,:) #endif integer(kind=c_intptr_t) :: q_dev #if COMPLEXCASE == 1 #ifdef USE_ASSUMED_SIZE complex(kind=COMPLEX_DATATYPE) :: q(ldq,*) #else complex(kind=COMPLEX_DATATYPE) :: q(ldq,matrixCols) #endif complex(kind=COMPLEX_DATATYPE) :: hh_trans(:,:) #endif integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol integer(kind=ik) :: tmp integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2 integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count #ifdef WITH_OPENMP integer(kind=ik) :: thread_width, csw, b_off, b_len #endif integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd integer(kind=ik) :: a_off, current_tv_off, max_blk_size integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk logical :: flag #if REALCASE == 1 #ifdef WITH_OPENMP real(kind=REAL_DATATYPE), pointer :: aIntern(:,:,:,:) #else real(kind=REAL_DATATYPE), pointer :: aIntern(:,:,:) #endif real(kind=REAL_DATATYPE) :: a_real #endif #if COMPLEXCASE == 1 #ifdef WITH_OPENMP complex(kind=COMPLEX_DATATYPE), pointer :: aIntern(:,:,:,:) #else complex(kind=COMPLEX_DATATYPE), pointer :: aIntern(:,:,:) #endif complex(kind=COMPLEX_DATATYPE) :: a_complex #endif type(c_ptr) :: aIntern_ptr #if REALCASE == 1 real(kind=REAL_DATATYPE) , allocatable :: row(:) real(kind=REAL_DATATYPE) , allocatable :: row_group(:,:) #endif #if COMPLEXCASE == 1 complex(kind=COMPLEX_DATATYPE), allocatable :: row(:) complex(kind=COMPLEX_DATATYPE), allocatable :: row_group(:,:) #endif #if REALCASE == 1 #ifdef WITH_OPENMP real(kind=REAL_DATATYPE), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) real(kind=REAL_DATATYPE), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) #else real(kind=REAL_DATATYPE), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:) real(kind=REAL_DATATYPE), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:) #endif #endif #if COMPLEXCASE == 1 #ifdef WITH_OPENMP complex(kind=COMPLEX_DATATYPE), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) complex(kind=COMPLEX_DATATYPE), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) #else complex(kind=COMPLEX_DATATYPE), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:) complex(kind=COMPLEX_DATATYPE), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:) #endif #endif integer(kind=c_intptr_t) :: aIntern_dev integer(kind=c_intptr_t) :: bcast_buffer_dev integer(kind=c_intptr_t) :: num integer(kind=c_intptr_t) :: dev_offset, dev_offset_1, dev_offset_2 integer(kind=c_intptr_t) :: row_dev integer(kind=c_intptr_t) :: row_group_dev integer(kind=c_intptr_t) :: hh_tau_dev integer(kind=c_intptr_t) :: hh_dot_dev integer(kind=ik) :: row_group_size, unpack_idx integer(kind=ik) :: n_times integer(kind=ik) :: top, chunk, this_chunk #if REALCASE == 1 real(kind=REAL_DATATYPE), allocatable :: result_buffer(:,:,:) real(kind=REAL_DATATYPE), allocatable :: bcast_buffer(:,:) #endif #if COMPLEXCASE == 1 complex(kind=COMPLEX_DATATYPE), allocatable :: result_buffer(:,:,:) complex(kind=COMPLEX_DATATYPE), allocatable :: bcast_buffer(:,:) #endif integer(kind=ik) :: n_off integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:) integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:) integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:) #ifdef WITH_OPENMP ! integer(kind=ik), allocatable :: mpi_statuses(:,:) #endif #ifdef WITH_OPENMP #ifdef WITH_MPI ! integer(kind=ik) :: my_MPI_STATUS_(MPI_STATUS_SIZE) #endif #endif #if COMPLEXCASE == 1 #ifdef WITH_MPI integer(kind=ik), external :: numroc #endif integer(kind=ik) :: na_rows, na_cols #endif ! MPI send/recv tags, arbitrary integer(kind=ik), parameter :: bottom_recv_tag = 111 integer(kind=ik), parameter :: top_recv_tag = 222 integer(kind=ik), parameter :: result_recv_tag = 333 #ifdef WITH_OPENMP integer(kind=ik) :: max_threads, my_thread integer(kind=ik) :: omp_get_max_threads #endif ! Just for measuring the kernel performance real(kind=c_double) :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double ! long integer integer(kind=lik) :: kernel_flops, kernel_flops_recv logical, intent(in) :: wantDebug logical :: success integer(kind=ik) :: istat, print_flops character(200) :: errorMessage logical :: successCUDA #ifndef WITH_MPI integer(kind=ik) :: j1 #endif integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& &PRECISION& &_& &MATH_DATATYPE call obj%timer%start("trans_ev_tridi_to_band_& &MATH_DATATYPE& &" // & &PRECISION_SUFFIX & ) if (useGPU) then #if COMPLEXCASE == 1 n_times = 0 #endif unpack_idx = 0 row_group_size = 0 endif success = .true. kernel_time = 0.0 kernel_flops = 0 #ifdef WITH_OPENMP max_threads = 1 max_threads = omp_get_max_threads() #endif call obj%timer%start("mpi_communication") call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr) call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr) call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr) call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr) call obj%timer%stop("mpi_communication") #if COMPLEXCASE == 1 if (useGPU) then #ifdef WITH_MPI na_rows = numroc(na, nblk, my_prow, 0, np_rows) na_cols = numroc(na, nblk, my_pcol, 0, np_cols) #else na_rows = na na_cols = na #endif endif #endif /* COMPLEXCASE */ if (mod(nbw,nblk)/=0) then if (my_prow==0 .and. my_pcol==0) then if (wantDebug) then write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_& &MATH_DATATYPE& &: ERROR: nbw=',nbw,', nblk=',nblk write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_& &MATH_DATATYPE& &: band backtransform works only for nbw==n*nblk' endif success = .false. return endif endif nfact = nbw / nblk ! local number of eigenvectors l_nev = local_index(nev, my_pcol, np_cols, nblk, -1) if (l_nev==0) then #ifdef WITH_OPENMP thread_width = 0 #endif stripe_width = 0 stripe_count = 0 last_stripe_width = 0 else ! l_nev #if WITH_OPENMP ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into ! every primary cache ! Suggested stripe width is 48 - should this be reduced for the complex case ??? if (useGPU) then stripe_width = 256 ! Must be a multiple of 4 stripe_count = (l_nev - 1) / stripe_width + 1 else ! useGPU ! openmp only in non-GPU case thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread #if REALCASE == 1 #ifdef DOUBLE_PRECISION_REAL stripe_width = 48 ! Must be a multiple of 4 #else stripe_width = 96 ! Must be a multiple of 8 #endif #endif /* REALCASE */ #if COMPLEXCASE == 1 #ifdef DOUBLE_PRECISION_COMPLEX stripe_width = 48 ! Must be a multiple of 2 #else stripe_width = 48 ! Must be a multiple of 4 #endif #endif /* COMPLEXCASE */ stripe_count = (thread_width-1)/stripe_width + 1 ! Adapt stripe width so that last one doesn't get too small stripe_width = (thread_width-1)/stripe_count + 1 #if REALCASE == 1 #ifdef DOUBLE_PRECISION_REAL if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. & kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. & kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes ! (8 * sizeof(double) == 64) else stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes ! (4 * sizeof(double) == 32) endif #else if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. & kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. & kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes ! (16 * sizeof(float) == 64) else stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes ! (8 * sizeof(float) == 32) endif #endif #endif /* REALCASE */ #if COMPLEXCASE == 1 #ifdef DOUBLE_PRECISION_COMPLEX if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. & kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes ! (4 * sizeof(double complex) == 64) else stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes ! (2 * sizeof(double complex) == 32) endif #else if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. & kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes ! (8 * sizeof(float complex) == 64) else stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes ! (4 * sizeof(float complex) == 32) endif #endif #endif /* COMPLEXCASE */ #if REALCASE == 1 last_stripe_width = l_nev - (stripe_count-1)*stripe_width #endif #if COMPLEXCASE == 1 ! only needed in no OMP case check thsis ! last_stripe_width = l_nev - (stripe_count-1)*stripe_width #endif endif ! useGPU #else /* WITH_OPENMP */ ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into ! every primary cache ! Suggested stripe width is 48 - should this be reduced for the complex case ??? if (useGPU) then stripe_width = 256 ! Must be a multiple of 4 stripe_count = (l_nev - 1) / stripe_width + 1 else ! useGPU #if REALCASE == 1 #ifdef DOUBLE_PRECISION_REAL stripe_width = 48 ! Must be a multiple of 4 #else stripe_width = 96 ! Must be a multiple of 8 #endif #endif /* REALCASE */ #if COMPLEXCASE == 1 #ifdef DOUBLE_PRECISION_COMPLEX stripe_width = 48 ! Must be a multiple of 2 #else stripe_width = 48 ! Must be a multiple of 4 #endif #endif /* COMPLEXCASE */ stripe_count = (l_nev-1)/stripe_width + 1 ! Adapt stripe width so that last one doesn't get too small stripe_width = (l_nev-1)/stripe_count + 1 #if REALCASE == 1 #ifdef DOUBLE_PRECISION_REAL if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. & kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. & kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes ! (8 * sizeof(double) == 64) else stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes ! (4 * sizeof(double) == 32) endif #else if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. & kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. & kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes ! (16 * sizeof(float) == 64) else stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes ! (8 * sizeof(float) == 32) endif #endif #endif /* REALCASE */ #if COMPLEXCASE == 1 #ifdef DOUBLE_PRECISION_COMPLEX if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. & kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes ! (4 * sizeof(double complex) == 64) else stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes ! (2 * sizeof(double complex) == 32) endif #else if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. & kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes ! (8 * sizeof(float complex) == 64) else stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes ! (4 * sizeof(float complex) == 32) endif #endif #endif /* COMPLEXCASE */ endif ! useGPU last_stripe_width = l_nev - (stripe_count-1)*stripe_width #endif /* WITH_OPENMP */ endif ! l_nev ! Determine the matrix distribution at the beginning allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating limits"//errorMessage stop 1 endif call determine_workload(obj,na, nbw, np_rows, limits) max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) a_dim2 = max_blk_size + nbw if (useGPU) then num = (stripe_width*a_dim2*stripe_count)* size_of_datatype successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count* size_of_datatype) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMalloc"//errorMessage stop 1 endif successCUDA = cuda_memset(aIntern_dev , 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMemset"//errorMessage stop 1 endif num = (l_nev)* size_of_datatype successCUDA = cuda_malloc( row_dev,num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMalloc " stop 1 endif successCUDA = cuda_memset(row_dev , 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMemset " stop 1 endif ! "row_group" and "row_group_dev" are needed for GPU optimizations allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating row_group"//errorMessage stop 1 endif #if REALCASE == 1 row_group(:, :) = CONST_0_0 #endif #if COMPLEXCASE == 1 row_group(:, :) = CONST_COMPLEX_0_0 #endif num = (l_nev*nblk)* size_of_datatype successCUDA = cuda_malloc(row_group_dev, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMalloc"//errorMessage stop 1 endif successCUDA = cuda_memset(row_group_dev , 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMemset"//errorMessage stop 1 endif else ! GPUs are not used #if 0 ! realcase or complexcase !DEC$ ATTRIBUTES ALIGN: 64:: aIntern #endif #ifdef WITH_OPENMP if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count*max_threads* & #if REALCASE == 1 C_SIZEOF(a_real)) /= 0) then #endif #if COMPLEXCASE == 1 C_SIZEOF(a_complex)) /= 0) then #endif print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating aIntern"//errorMessage stop 1 endif call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads]) ! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage) ! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here! #else /* WITH_OPENMP */ if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count* & #if REALCASE == 1 C_SIZEOF(a_real)) /= 0) then #endif #if COMPLEXCASE == 1 C_SIZEOF(a_complex)) /= 0) then #endif print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage stop 1 endif call c_f_pointer(aIntern_ptr, aIntern,[stripe_width,a_dim2,stripe_count] ) !allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage) #if REALCASE == 1 aIntern(:,:,:) = CONST_0_0 #endif #if COMPLEXCASE == 1 aIntern(:,:,:) = 0 #endif #endif /* WITH_OPENMP */ endif !useGPU allocate(row(l_nev), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating row"//errorMessage stop 1 endif #if REALCASE == 1 row(:) = CONST_0_0 #endif #if COMPLEXCASE == 1 row(:) = 0 #endif ! Copy q from a block cyclic distribution into a distribution with contiguous rows, ! and transpose the matrix using stripes of given stripe_width for cache blocking. ! The peculiar way it is done below is due to the fact that the last row should be ! ready first since it is the first one to start below #ifdef WITH_OPENMP ! Please note about the OMP usage below: ! This is not for speed, but because we want the matrix a in the memory and ! in the cache of the correct thread (if possible) call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads #if REALCASE == 1 aIntern(:,:,:,my_thread) = CONST_0_0 ! if possible, do first touch allocation! #endif #if COMPLEXCASE == 1 aIntern(:,:,:,my_thread) = CONST_COMPLEX_0_0 ! if possible, do first touch allocation! #endif enddo !$omp end parallel do call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) #endif /* WITH_OPENMP */ do ip = np_rows-1, 0, -1 if (my_prow == ip) then ! Receive my rows which have not yet been received src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) do i=limits(ip)+1,limits(ip+1) src = mod((i-1)/nblk, np_rows) if (src < my_prow) then #ifdef WITH_OPENMP #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Recv(row, l_nev, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ ! row(1:l_nev) = row(1:l_nev) #endif /* WITH_MPI */ call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call unpack_row_& &MATH_DATATYPE& &_cpu_openmp_& &PRECISION & (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, & thread_width, stripe_width, l_nev) enddo !$omp end parallel do call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) #else /* WITH_OPENMP */ if (useGPU) then ! An unpacking of the current row group may occur before queuing the next row call unpack_and_prepare_row_group_& &MATH_DATATYPE& &_gpu_& &PRECISION & ( & row_group, row_group_dev, aIntern_dev, stripe_count, & stripe_width, last_stripe_width, a_dim2, l_nev,& row_group_size, nblk, unpack_idx, & i - limits(ip), .false.) #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Recv(row_group(:, row_group_size), l_nev, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct? #endif /* WITH_MPI */ else ! useGPU #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Recv(row, l_nev, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ ! row(1:l_nev) = row(1:l_nev) #endif /* WITH_MPI */ call unpack_row_& &MATH_DATATYPE& &_cpu_& &PRECISION & (obj,aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) endif ! useGPU #endif /* WITH_OPENMP */ elseif (src == my_prow) then src_offset = src_offset+1 if (useGPU) then #ifndef WITH_OPENMP ! An unpacking of the current row group may occur before queuing the next row call unpack_and_prepare_row_group_& &MATH_DATATYPE& &_gpu_& &PRECISION & ( & row_group, row_group_dev, aIntern_dev, stripe_count, & stripe_width, last_stripe_width, a_dim2, l_nev,& row_group_size, nblk, unpack_idx, & i - limits(ip), .false.) #if REALCASE == 1 row_group(:, row_group_size) = q(src_offset, 1:l_nev) #endif #if COMPLEXCASE == 1 row_group(:, row_group_size) = q(src_offset, 1:l_nev) #endif #else /* WITH_OPENMP */ #if COMPLEXCASE == 1 ! why is an cuda call in the openmp region? call unpack_and_prepare_row_group_complex_gpu_& &PRECISION& &(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, & last_stripe_width, a_dim2, l_nev, row_group_size, nblk, & unpack_idx, i - limits(ip),.false.) row_group(:, row_group_size) = q(src_offset, 1:l_nev) #endif #endif /* not OpenMP */ else row(:) = q(src_offset, 1:l_nev) endif #ifdef WITH_OPENMP call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call unpack_row_& &MATH_DATATYPE& &_cpu_openmp_& &PRECISION & (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, thread_width, stripe_width, l_nev) enddo !$omp end parallel do call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) #else /* WITH_OPENMP */ if (useGPU) then else call unpack_row_& &MATH_DATATYPE& &_cpu_& &PRECISION & (obj,aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) endif #endif /* WITH_OPENMP */ endif enddo ! Send all rows which have not yet been send src_offset = 0 do dst = 0, ip-1 do i=limits(dst)+1,limits(dst+1) if (mod((i-1)/nblk, np_rows) == my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Send(row, l_nev, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif dst, 0, mpi_comm_rows, mpierr) call obj%timer%stop("mpi_communication") #endif /* WITH_MPI */ endif enddo enddo else if (my_prow < ip) then ! Send all rows going to PE ip src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) do i=limits(ip)+1,limits(ip+1) src = mod((i-1)/nblk, np_rows) if (src == my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Send(row, l_nev, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif ip, 0, mpi_comm_rows, mpierr) call obj%timer%stop("mpi_communication") #endif /* WITH_MPI */ endif enddo ! Receive all rows from PE ip do i=limits(my_prow)+1,limits(my_prow+1) src = mod((i-1)/nblk, np_rows) if (src == ip) then #ifdef WITH_OPENMP #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Recv(row, l_nev, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ ! row(1:l_nev) = row(1:l_nev) #endif /* WITH_MPI */ call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call unpack_row_& &MATH_DATATYPE& &_cpu_openmp_& &PRECISION & (obj,aIntern, row, i-limits(my_prow), my_thread, stripe_count, thread_width, stripe_width, l_nev) enddo !$omp end parallel do call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) #else /* WITH_OPENMP */ if (useGPU) then ! An unpacking of the current row group may occur before queuing the next row call unpack_and_prepare_row_group_& &MATH_DATATYPE& &_gpu_& &PRECISION& &( & row_group, row_group_dev, aIntern_dev, stripe_count, & stripe_width, last_stripe_width, a_dim2, l_nev, & row_group_size, nblk, unpack_idx, & i - limits(my_prow), .false.) #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Recv(row_group(:, row_group_size), l_nev, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ #if REALCASE == 1 row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ? #endif #if COMPLEXCASE == 1 ! todo: what of this is correct? Was different for double/single precision #ifdef DOUBLE_PRECISION_COMPLEX row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ? #else row_group(1:l_nev,row_group_size) = row_group(1:l_nev,row_group_size) ! is this correct #endif #endif #endif /* WITH_MPI */ else ! useGPU #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Recv(row, l_nev, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ ! row(1:l_nev) = row(1:l_nev) #endif call unpack_row_& &MATH_DATATYPE& &_cpu_& &PRECISION & (obj,aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width) endif ! useGPU #endif /* WITH_OPENMP */ endif enddo endif enddo if (useGPU) then ! Force an unpacking of all remaining rows that haven't been unpacked yet call unpack_and_prepare_row_group_& &MATH_DATATYPE& &_gpu_& &PRECISION& &( & row_group, row_group_dev, aIntern_dev, stripe_count, & stripe_width, last_stripe_width, & a_dim2, l_nev, row_group_size, nblk, unpack_idx, & -1, .true.) successCUDA = cuda_devicesynchronize() if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaDeviceSynchronize"//errorMessage stop 1 endif endif ! Set up result buffer queue num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows num_result_buffers = 4*nfact allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating result_buffer"//errorMessage stop 1 endif allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating result_send_request"//errorMessage stop 1 endif allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating result_recv_request"//errorMessage stop 1 endif #ifdef WITH_MPI result_send_request(:) = MPI_REQUEST_NULL result_recv_request(:) = MPI_REQUEST_NULL #endif ! Queue up buffers #ifdef WITH_MPI call obj%timer%start("mpi_communication") if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends do j = 1, min(num_result_buffers, num_result_blocks) call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif 0, result_recv_tag, mpi_comm_rows, result_recv_request(j), mpierr) enddo endif call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ ! carefull the "recv" has to be done at the corresponding wait or send ! result_buffer(1: l_nev*nblk,1,j) =result_buffer(1:l_nev*nblk,1,nbuf) #endif /* WITH_MPI */ num_bufs_recvd = 0 ! No buffers received yet ! Initialize top/bottom requests allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MPI_DATATYPE& &: error when allocating top_send_request"//errorMessage stop 1 endif allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating top_recv_request"//errorMessage stop 1 endif allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating bottom_send_request"//errorMessage stop 1 endif allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating bottom_recv_request"//errorMessage stop 1 endif #ifdef WITH_MPI top_send_request(:) = MPI_REQUEST_NULL top_recv_request(:) = MPI_REQUEST_NULL bottom_send_request(:) = MPI_REQUEST_NULL bottom_recv_request(:) = MPI_REQUEST_NULL #endif #ifdef WITH_OPENMP allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating top_border_send_buffer"//errorMessage stop 1 endif allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating top_border_recv_buffer"//errorMessage stop 1 endif allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating bottom_border_send_buffer"//errorMessage stop 1 endif allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating bottom_border_recv_buffer"//errorMessage stop 1 endif #if REALCASE == 1 top_border_send_buffer(:,:) = CONST_0_0 top_border_recv_buffer(:,:) = CONST_0_0 bottom_border_send_buffer(:,:) = CONST_0_0 bottom_border_recv_buffer(:,:) = CONST_0_0 #endif #if COMPLEXCASE == 1 top_border_send_buffer(:,:) = CONST_COMPLEX_0_0 top_border_recv_buffer(:,:) = CONST_COMPLEX_0_0 bottom_border_send_buffer(:,:) = CONST_COMPLEX_0_0 bottom_border_recv_buffer(:,:) = CONST_COMPLEX_0_0 #endif ! Initialize broadcast buffer #else /* WITH_OPENMP */ allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating top_border_send_bufer"//errorMessage stop 1 endif allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating top_border_recv_buffer"//errorMessage stop 1 endif allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating bottom_border_send_buffer"//errorMessage stop 1 endif allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating bottom_border_recv_buffer"//errorMessage stop 1 endif #if REALCASE == 1 top_border_send_buffer(:,:,:) = CONST_0_0 top_border_recv_buffer(:,:,:) = CONST_0_0 bottom_border_send_buffer(:,:,:) = CONST_0_0 bottom_border_recv_buffer(:,:,:) = CONST_0_0 #endif #if COMPLEXCASE == 1 top_border_send_buffer(:,:,:) = CONST_COMPLEX_0_0 top_border_recv_buffer(:,:,:) = CONST_COMPLEX_0_0 bottom_border_send_buffer(:,:,:) = CONST_COMPLEX_0_0 bottom_border_recv_buffer(:,:,:) = CONST_COMPLEX_0_0 #endif #endif /* WITH_OPENMP */ ! Initialize broadcast buffer allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when allocating bcast_buffer"//errorMessage stop 1 endif #if REALCASE == 1 bcast_buffer = CONST_0_0 #endif #if COMPLEXCASE == 1 bcast_buffer = 0 #endif if (useGPU) then num = ( nbw * max_blk_size) * size_of_datatype successCUDA = cuda_malloc(bcast_buffer_dev, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMalloc" stop 1 endif successCUDA = cuda_memset( bcast_buffer_dev, 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMemset" stop 1 endif num = ((max_blk_size-1))* size_of_datatype successCUDA = cuda_malloc( hh_dot_dev, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMalloc" stop 1 endif successCUDA = cuda_memset( hh_dot_dev, 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMemset" stop 1 endif num = (max_blk_size)* size_of_datatype successCUDA = cuda_malloc( hh_tau_dev, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMalloc" stop 1 endif successCUDA = cuda_memset( hh_tau_dev, 0, num) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMemset" stop 1 endif endif ! useGPU current_tv_off = 0 ! Offset of next row to be broadcast ! ------------------- start of work loop ------------------- a_off = 0 ! offset in aIntern (to avoid unnecessary shifts) top_msg_length = 0 bottom_msg_length = 0 do sweep = 0, (na-1)/nbw current_n = na - sweep*nbw call determine_workload(obj,current_n, nbw, np_rows, limits) current_n_start = limits(my_prow) current_n_end = limits(my_prow+1) current_local_n = current_n_end - current_n_start next_n = max(current_n - nbw, 0) call determine_workload(obj,next_n, nbw, np_rows, limits) next_n_start = limits(my_prow) next_n_end = limits(my_prow+1) next_local_n = next_n_end - next_n_start if (next_n_end < next_n) then bottom_msg_length = current_n_end - next_n_end else bottom_msg_length = 0 endif if (next_local_n > 0) then next_top_msg_length = current_n_start - next_n_start else next_top_msg_length = 0 endif if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then #ifdef WITH_MPI call obj%timer%start("mpi_communication") #endif do i = 1, stripe_count #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop 1 endif csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width" b_len = csw*nbw*max_threads #ifdef WITH_MPI call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #else /* WITH_MPI */ ! carefull the "recieve" has to be done at the corresponding wait or send ! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef WITH_MPI call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #else /* WITH_MPI */ ! carefull the recieve has to be done at the corresponding wait or send ! bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ enddo #if WITH_MPI call obj%timer%stop("mpi_communication") #endif endif if (current_local_n > 1) then if (my_pcol == mod(sweep,np_cols)) then bcast_buffer(:,1:current_local_n) = & hh_trans(:,current_tv_off+1:current_tv_off+current_local_n) current_tv_off = current_tv_off + current_local_n endif #ifdef WITH_MPI call obj%timer%start("mpi_communication") call mpi_bcast(bcast_buffer, nbw*current_local_n, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif mod(sweep,np_cols), mpi_comm_cols, mpierr) call obj%timer%stop("mpi_communication") #endif /* WITH_MPI */ if (useGPU) then successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), & nbw * current_local_n * & size_of_datatype, & cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMemcpy" stop 1 endif call extract_hh_tau_& &MATH_DATATYPE& &_gpu_& &PRECISION & !#if REALCASE == 1 (bcast_buffer_dev, hh_tau_dev, nbw, & !#endif !#if COMPLEXCASE == 1 ! ( nbw, & !#endif current_local_n, .false.) call compute_hh_dot_products_& &MATH_DATATYPE& &_gpu_& &PRECISION & (bcast_buffer_dev, hh_dot_dev, nbw, & current_local_n) endif ! useGPU else ! (current_local_n > 1) then ! for current_local_n == 1 the one and only HH Vector is 0 and not stored in hh_trans_real/complex #if REALCASE == 1 bcast_buffer(:,1) = CONST_0_0 #endif #if COMPLEXCASE == 1 bcast_buffer(:,1) = CONST_COMPLEX_0_0 #endif if (useGPU) then successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * size_of_datatype) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMemset" stop 1 endif call extract_hh_tau_& &MATH_DATATYPE& &_gpu_& &PRECISION& &( & bcast_buffer_dev, hh_tau_dev, & nbw, 1, .true.) endif ! useGPU endif ! (current_local_n > 1) then if (l_nev == 0) cycle if (current_local_n > 0) then do i = 1, stripe_count #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop 1 endif ! Get real stripe width for strip i; ! The last OpenMP tasks may have an even smaller stripe with, ! but we don't care about this, i.e. we send/recv a bit too much in this case. ! csw: current_stripe_width csw = min(stripe_width, thread_width-(i-1)*stripe_width) #endif /* WITH_OPENMP */ !wait_b if (current_n_end < current_n) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop 1 endif #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif #if REALCASE == 1 call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads n_off = current_local_n+a_off b_len = csw*nbw b_off = (my_thread-1)*b_len aIntern(1:csw,n_off+1:n_off+nbw,i,my_thread) = & reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) enddo !$omp end parallel do call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) #endif #if COMPLEXCASE == 1 call obj%timer%start("OpenMP parallel_PRECISION") !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads n_off = current_local_n+a_off b_len = csw*nbw b_off = (my_thread-1)*b_len aIntern(1:csw,n_off+1:n_off+nbw,i,my_thread) = & reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) enddo !$omp end parallel do call obj%timer%stop("OpenMP parallel_PRECISION") #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif n_off = current_local_n+a_off if (useGPU) then dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_datatype successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc(bottom_border_recv_buffer(1,1,i)), & stripe_width*nbw* size_of_datatype, & cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMemcpy" stop 1 endif else aIntern(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i) endif #endif /* WITH_OPENMP */ if (next_n_end < next_n) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop 1 endif #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) call obj%timer%stop("mpi_communication") #else /* WTIH_MPI */ ! carefull the recieve has to be done at the corresponding wait or send ! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif my_prow+1, bottom_recv_tag, mpi_comm_rows, bottom_recv_request(i), mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ !! carefull the recieve has to be done at the corresponding wait or send !! bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i) #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ endif endif if (current_local_n <= bottom_msg_length + top_msg_length) then !wait_t if (top_msg_length>0) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: not yet implemented" stop 1 endif #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif if (useGPU) then dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype ! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8 successCUDA = cuda_memcpy( aIntern_dev+dev_offset , loc(top_border_recv_buffer(1,1,i)), & stripe_width*top_msg_length* size_of_datatype, & cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMemcpy" stop 1 endif else ! useGPU aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) endif ! useGPU #endif /* WITH_OPENMP */ endif ! top_msg_length !compute #ifdef WITH_OPENMP call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads if (top_msg_length>0) then b_len = csw*top_msg_length b_off = (my_thread-1)*b_len aIntern(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) endif call compute_hh_trafo_& &MATH_DATATYPE& &_openmp_& &PRECISION & (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, & l_nev, a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & #if REALCASE == 1 hh_dot_dev, & #endif hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, & i, my_thread, thread_width, kernel) enddo !$omp end parallel do call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) #else /* WITH_OPENMP */ call compute_hh_trafo_& &MATH_DATATYPE& &_& &PRECISION& & (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & #if REALCASE == 1 hh_dot_dev, & #endif hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, i, & last_stripe_width, kernel) !#if COMPLEXCASE == 1 ! if (useGPU) then ! call compute_hh_trafo_complex_gpu_& ! &PRECISION& ! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, current_local_n, i, a_off, dev_offset, dev_offset_1, & ! dev_offset_2, a_dim2, & ! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width) ! else ! call compute_hh_trafo_complex_& ! &PRECISION& ! &(aIntern, stripe_width, a_dim2, stripe_count, & ! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & ! 0, current_local_n, i, last_stripe_width, & ! kernel) ! endif !#endif /* COMPLEXCASE */ #endif /* WITH_OPENMP */ !send_b 1 #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif if (bottom_msg_length>0) then n_off = current_local_n+nbw-bottom_msg_length+a_off #ifdef WITH_OPENMP b_len = csw*bottom_msg_length*max_threads bottom_border_send_buffer(1:b_len,i) = & reshape(aIntern(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Isend(bottom_border_send_buffer(1,i), b_len, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif my_prow+1, top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ if (next_top_msg_length > 0) then top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw* & next_top_msg_length*max_threads,i) endif #endif /* WITH_MPI */ !#if REALCASE == 1 endif ! this endif is not here in complex -case is for bottom_msg_length !#endif #else /* WITH_OPENMP */ if (useGPU) then dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, & stripe_width * bottom_msg_length * size_of_datatype, & cudaMemcpyDeviceToHost) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMemcpy" stop 1 endif else bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i) endif #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif my_prow+1, top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ if (next_top_msg_length > 0) then top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) endif #endif /* WITH_MPI */ endif #endif /* WITH_OPENMP */ else ! current_local_n <= bottom_msg_length + top_msg_length !compute #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop 1 endif call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads call compute_hh_trafo_& &MATH_DATATYPE& &_openmp_& &PRECISION& & (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, & nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & #if REALCASE == 1 hh_dot_dev, & #endif hh_tau_dev, kernel_flops, kernel_time, n_times, current_local_n - bottom_msg_length, & bottom_msg_length, i, my_thread, thread_width, kernel) enddo !$omp end parallel do call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) !send_b #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif if (bottom_msg_length > 0) then n_off = current_local_n+nbw-bottom_msg_length+a_off b_len = csw*bottom_msg_length*max_threads bottom_border_send_buffer(1:b_len,i) = & reshape(aIntern(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Isend(bottom_border_send_buffer(1,i), b_len, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif my_prow+1, top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ if (next_top_msg_length > 0) then top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw* & next_top_msg_length*& max_threads,i) endif #endif /* WITH_MPI */ endif #else /* WITH_OPENMP */ call compute_hh_trafo_& &MATH_DATATYPE& &_& &PRECISION& & (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & #if REALCASE == 1 hh_dot_dev, & #endif hh_tau_dev, kernel_flops, kernel_time, n_times, & current_local_n - bottom_msg_length, bottom_msg_length, i, & last_stripe_width, kernel) !#if COMPLEXCASE == 1 !! the complex case and real case diverged here ! if (useGPU) then ! call compute_hh_trafo_complex_gpu_& ! &PRECISION& ! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, & ! dev_offset, dev_offset_1, dev_offset_2, & ! a_dim2, & ! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width) ! else ! call compute_hh_trafo_complex_& ! &PRECISION& ! &(aIntern, stripe_width, a_dim2, stripe_count, & ! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & ! current_local_n - bottom_msg_length, bottom_msg_length, i, & ! last_stripe_width, kernel) ! ! endif ! !#endif !send_b #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif if (bottom_msg_length > 0) then n_off = current_local_n+nbw-bottom_msg_length+a_off if (useGPU) then dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, & stripe_width*bottom_msg_length* size_of_datatype, & cudaMemcpyDeviceToHost) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error cudaMemcpy" stop 1 endif else bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i) endif #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif my_prow+1, top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ if (next_top_msg_length > 0) then top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) endif #endif /* WITH_MPI */ #if REALCASE == 1 endif #endif #endif /* WITH_OPENMP */ #ifndef WITH_OPENMP #if COMPLEXCASE == 1 endif #endif #endif !compute #ifdef WITH_OPENMP call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call compute_hh_trafo_& &MATH_DATATYPE& &_openmp_& &PRECISION& & (obj,aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, & nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & #if REALCASE == 1 hh_dot_dev, & #endif hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length,& current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, & kernel) enddo !$omp end parallel do call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) #else /* WITH_OPENMP */ call compute_hh_trafo_& &MATH_DATATYPE& &_& &PRECISION& & (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & #if REALCASE == 1 hh_dot_dev, & #endif hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length, & current_local_n-top_msg_length-bottom_msg_length, i, & last_stripe_width, kernel) !#if COMPLEXCASE == 1 ! if (useGPU) then ! call compute_hh_trafo_complex_gpu_& ! &PRECISION& ! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, top_msg_length,current_local_n-top_msg_length-bottom_msg_length, & ! i, a_off, dev_offset, dev_offset_1, dev_offset_2, & ! a_dim2, & ! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width) ! else ! call compute_hh_trafo_complex_& ! &PRECISION& ! &(aIntern, stripe_width, a_dim2, stripe_count, & ! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & ! top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & ! last_stripe_width, kernel) ! endif !#endif /* COMPLEXCASE */ #endif /* WITH_OPENMP */ !wait_t if (top_msg_length>0) then #ifdef WITH_OPENMP #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif if (useGPU) then dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc( top_border_recv_buffer(:,1,i)), & stripe_width * top_msg_length * size_of_datatype, & cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMemcpy" stop 1 endif else aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) endif #endif /* WITH_OPENMP */ endif !compute #ifdef WITH_OPENMP call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads if (top_msg_length>0) then b_len = csw*top_msg_length b_off = (my_thread-1)*b_len aIntern(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) endif call compute_hh_trafo_& &MATH_DATATYPE& &_openmp_& &PRECISION& & (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, & nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & #if REALCASE == 1 hh_dot_dev, & #endif hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, my_thread, & thread_width, kernel) enddo !$omp end parallel do call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) #else /* WITH_OPENMP */ call compute_hh_trafo_& &MATH_DATATYPE& &_& &PRECISION& & (obj,aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & #if REALCASE == 1 hh_dot_dev, & #endif hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, & last_stripe_width, kernel) !#if COMPLEXCASE == 1 ! if (useGPU) then ! call compute_hh_trafo_complex_gpu_& ! &PRECISION& ! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2, & ! a_dim2, & ! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width) ! else ! call compute_hh_trafo_complex_& ! &PRECISION& ! &(aIntern, stripe_width, a_dim2, stripe_count, & ! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & ! 0, top_msg_length, i, last_stripe_width, & ! kernel) ! endif !#endif #endif /* WITH_OPENMP */ endif if (next_top_msg_length > 0) then !request top_border data #ifdef WITH_OPENMP b_len = csw*next_top_msg_length*max_threads #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Irecv(top_border_recv_buffer(1,i), b_len, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif my_prow-1, top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ ! carefull the "recieve" has to be done at the corresponding wait or send ! top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & ! bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif my_prow-1, top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ ! carefull the "recieve" has to be done at the corresponding wait or send ! top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & ! bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ endif !send_t if (my_prow > 0) then #ifdef WITH_OPENMP #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif b_len = csw*nbw*max_threads top_border_send_buffer(1:b_len,i) = reshape(aIntern(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Isend(top_border_send_buffer(1,i), b_len, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif my_prow-1, bottom_recv_tag, mpi_comm_rows, top_send_request(i), mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) endif if (next_n_end < next_n) then bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) endif #endif /* WITH_MPI */ #else /* WITH_OPENMP */ #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif if (useGPU) then dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), aIntern_dev + dev_offset, & stripe_width*nbw * size_of_datatype, & cudaMemcpyDeviceToHost) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMemcpy" stop 1 endif else top_border_send_buffer(:,1:nbw,i) = aIntern(:,a_off+1:a_off+nbw,i) endif #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif my_prow-1, bottom_recv_tag, mpi_comm_rows, top_send_request(i), mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) endif if (next_n_end < next_n) then bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i) endif #endif /* WITH_MPI */ #endif /* WITH_OPENMP */ endif ! Care that there are not too many outstanding top_recv_request's if (stripe_count > 1) then if (i>1) then #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif else #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif endif endif enddo top_msg_length = next_top_msg_length else ! wait for last top_send_request #ifdef WITH_MPI do i = 1, stripe_count call obj%timer%start("mpi_communication") call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") enddo #endif endif ! Care about the result if (my_prow == 0) then ! topmost process sends nbw rows to destination processes do j=0, nfact-1 num_blk = sweep*nfact+j ! global number of destination block, 0 based if (num_blk*nblk >= na) exit nbuf = mod(num_blk, num_result_buffers) + 1 ! buffer number to get this block #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif dst = mod(num_blk, np_rows) if (dst == 0) then if (useGPU) then row_group_size = min(na - num_blk*nblk, nblk) call pack_row_group_& &MATH_DATATYPE& &_gpu_& &PRECISION& &( & row_group_dev, aIntern_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, & row_group(:, :), j * nblk + a_off, row_group_size) do i = 1, row_group_size q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i) enddo else ! useGPU do i = 1, min(na - num_blk*nblk, nblk) #ifdef WITH_OPENMP #if REALCASE == 1 call pack_row_real_cpu_openmp_& #endif #if COMPLEXCASE == 1 call pack_row_complex_cpu_openmp_& #endif &PRECISION& &(obj,aIntern, row, j*nblk+i+a_off, stripe_width, stripe_count, max_threads, thread_width, l_nev) #else /* WITH_OPENMP */ #if REALCASE == 1 call pack_row_real_cpu_& #endif #if COMPLEXCASE == 1 call pack_row_complex_cpu_& #endif &PRECISION& &(obj,aIntern, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) #endif /* WITH_OPENMP */ q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) enddo endif ! useGPU else ! (dst == 0) if (useGPU) then call pack_row_group_& &MATH_DATATYPE& &_gpu_& &PRECISION& &( & row_group_dev, aIntern_dev, stripe_count, stripe_width, & last_stripe_width, a_dim2, l_nev, & result_buffer(:, :, nbuf), j * nblk + a_off, nblk) else ! useGPU do i = 1, nblk #if WITH_OPENMP #if REALCASE == 1 call pack_row_real_cpu_openmp_& #endif #if COMPLEXCASE == 1 call pack_row_complex_cpu_openmp_& #endif &PRECISION& &(obj,aIntern, result_buffer(:,i,nbuf), j*nblk+i+a_off, stripe_width, stripe_count, & max_threads, thread_width, l_nev) #else /* WITH_OPENMP */ #if REALCASE == 1 call pack_row_real_cpu_& #endif #if COMPLEXCASE == 1 call pack_row_complex_cpu_& #endif &PRECISION& &(obj, aIntern, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) #endif /* WITH_OPENMP */ enddo endif ! useGPU #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif dst, result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ if (j+num_result_buffers < num_result_blocks) & result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf) if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends do j1 = 1, min(num_result_buffers, num_result_blocks) result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf) enddo endif #endif /* WITH_MPI */ endif ! (dst == 0) enddo !j=0, nfact-1 else ! (my_prow == 0) ! receive and store final result do j = num_bufs_recvd, num_result_blocks-1 nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block ! If there is still work to do, just test for the next result request ! and leave the loop if it is not ready, otherwise wait for all ! outstanding requests if (next_local_n > 0) then #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ flag = .true. #endif if (.not.flag) exit else ! (next_local_n > 0) #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif endif ! (next_local_n > 0) ! Fill result buffer into q num_blk = j*np_rows + my_prow ! global number of current block, 0 based do i = 1, min(na - num_blk*nblk, nblk) q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) enddo ! Queue result buffer again if there are outstanding blocks left #ifdef WITH_MPI call obj%timer%start("mpi_communication") if (j+num_result_buffers < num_result_blocks) & call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, & #if REALCASE == 1 MPI_REAL_PRECISION, & #endif #if COMPLEXCASE == 1 MPI_COMPLEX_EXPLICIT_PRECISION, & #endif 0, result_recv_tag, mpi_comm_rows, result_recv_request(nbuf), mpierr) ! carefull the "recieve" has to be done at the corresponding wait or send ! if (j+num_result_buffers < num_result_blocks) & ! result_buffer(1:l_nev*nblk,1,nbuf) = result_buffer(1:l_nev*nblk,1,nbuf) call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ #endif /* WITH_MPI */ enddo ! j = num_bufs_recvd, num_result_blocks-1 num_bufs_recvd = j endif ! (my_prow == 0) ! Shift the remaining rows to the front of aIntern (if necessary) offset = nbw - top_msg_length if (offset<0) then if (wantDebug) write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_& &MATH_DATATYPE& &: internal error, offset for shifting = ',offset success = .false. return endif a_off = a_off + offset if (a_off + next_local_n + nbw > a_dim2) then #ifdef WITH_OPENMP if (useGPU) then print *,"trans_ev_tridi_to_band_real: not yet implemented" stop 1 endif call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) !$omp parallel do private(my_thread, i, j), schedule(static, 1) do my_thread = 1, max_threads do i = 1, stripe_count do j = top_msg_length+1, top_msg_length+next_local_n aIntern(:,j,i,my_thread) = aIntern(:,j+a_off,i,my_thread) enddo enddo enddo !$omp end parallel do call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) #else /* WITH_OPENMP */ do i = 1, stripe_count if (useGPU) then chunk = min(next_local_n - 1, a_off) do j = top_msg_length + 1, top_msg_length + next_local_n, chunk top = min(j + chunk, top_msg_length + next_local_n) this_chunk = top - j + 1 dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype ! it is not logical to set here always the value for the parameter ! "cudaMemcpyDeviceToDevice" do this ONCE at startup ! tmp = cuda_d2d(1) successCUDA = cuda_memcpy( aIntern_dev + dev_offset , aIntern_dev +dev_offset_1, & stripe_width*this_chunk* size_of_datatype, cudaMemcpyDeviceToDevice) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error cudaMemcpy" stop 1 endif enddo else ! not useGPU do j = top_msg_length+1, top_msg_length+next_local_n aIntern(:,j,i) = aIntern(:,j+a_off,i) enddo endif enddo ! stripe_count #endif /* WITH_OPENMP */ #ifdef WITH_OPENMP call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) #endif /* WITH_OPENMP */ a_off = 0 endif enddo ! Just for safety: #ifdef WITH_MPI if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol #endif if (my_prow == 0) then #ifdef WITH_MPI call obj%timer%start("mpi_communication") call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) call obj%timer%stop("mpi_communication") #endif endif #ifdef WITH_MPI if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol #ifdef HAVE_DETAILED_TIMINGS call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_ROWS, mpierr) kernel_flops = kernel_flops_recv call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_COLS, mpierr) kernel_flops = kernel_flops_recv call MPI_ALLREDUCE(kernel_time, kernel_time_recv, 1, MPI_REAL8, MPI_MAX, MPI_COMM_ROWS, mpierr) kernel_time_recv = kernel_time call MPI_ALLREDUCE(kernel_time, kernel_time_recv, 1, MPI_REAL8, MPI_MAX, MPI_COMM_COLS, mpierr) kernel_time_recv = kernel_time #endif #else /* WITH_MPI */ call obj%get("print_flops",print_flops) if (my_prow==0 .and. my_pcol==0 .and.print_flops == 1) & write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6 #endif /* WITH_MPI */ if (useGPU) then !#if REALCASE == 1 ! copy q to q_dev needed in trans_ev_band_to_full successCUDA = cuda_malloc(q_dev, ldq*matrixCols* size_of_datatype) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMalloc" stop 1 endif ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)* size_of_datatype, & cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaMalloc" stop 1 endif ! endif !#endif endif !use GPU ! deallocate all working space if (.not.(useGPU)) then nullify(aIntern) call free(aIntern_ptr) ! deallocate(aIntern, stat=istat, errmsg=errorMessage) ! if (istat .ne. 0) then ! print *,"trans_ev_tridi_to_band_real: error when deallocating aIntern "//errorMessage ! stop 1 ! endif endif deallocate(row, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating row "//errorMessage stop 1 endif deallocate(limits, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating limits"//errorMessage stop 1 endif deallocate(result_send_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating result_send_request "//errorMessage stop 1 endif deallocate(result_recv_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating result_recv_request "//errorMessage stop 1 endif deallocate(top_border_send_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating top_border_send_buffer "//errorMessage stop 1 endif deallocate(top_border_recv_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating top_border_recv_buffer "//errorMessage stop 1 endif deallocate(bottom_border_send_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating bottom_border_send_buffer "//errorMessage stop 1 endif deallocate(bottom_border_recv_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating bottom_border_recv_buffer "//errorMessage stop 1 endif deallocate(result_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating result_buffer "//errorMessage stop 1 endif deallocate(bcast_buffer, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating bcast_buffer "//errorMessage stop 1 endif deallocate(top_send_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating top_send_request "//errorMessage stop 1 endif deallocate(top_recv_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating top_recv_request "//errorMessage stop 1 endif deallocate(bottom_send_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating bottom_send_request "//errorMessage stop 1 endif deallocate(bottom_recv_request, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating bottom_recv_request "//errorMessage stop 1 endif if (useGPU) then #if COMPLEXCASE == 1 successCUDA = cuda_free(aIntern_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_complex: error in cudaFree" stop 1 endif #endif successCUDA = cuda_free(hh_dot_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &real: error in cudaFree "//errorMessage stop 1 endif successCUDA = cuda_free(hh_tau_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaFree "//errorMessage stop 1 endif successCUDA = cuda_free(row_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaFree "//errorMessage stop 1 endif deallocate(row_group, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error when deallocating row_group "//errorMessage stop 1 endif successCUDA = cuda_free(row_group_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaFree "//errorMessage stop 1 endif successCUDA = cuda_free(bcast_buffer_dev) if (.not.(successCUDA)) then print *,"trans_ev_tridi_to_band_& &MATH_DATATYPE& &: error in cudaFree "//errorMessage stop 1 endif endif ! useGPU call obj%timer%stop("trans_ev_tridi_to_band_& &MATH_DATATYPE& &" // & &PRECISION_SUFFIX& ) return !#if COMPLEXCASE == 1 ! contains ! ! The host wrapper for extracting "tau" from the HH reflectors (see the ! ! kernel below) ! subroutine extract_hh_tau_complex_gpu_& ! &PRECISION& ! &(nbw, n, is_zero) ! use cuda_c_kernel ! use pack_unpack_gpu ! use precision ! implicit none ! integer(kind=ik), value :: nbw, n ! logical, value :: is_zero ! integer(kind=ik) :: val_is_zero ! ! if (is_zero) then ! val_is_zero = 1 ! else ! val_is_zero = 0 ! endif ! call launch_extract_hh_tau_c_kernel_complex_& ! &PRECISION& ! &(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero) ! end subroutine !#endif /* COMPLEXCASE */ end subroutine ! vim: syntax=fortran