Commit c79d2841 authored by Andreas Marek

New GPU code path in trans_ev_tridi also for OpenMP-only builds

parent 899f8870
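For orientation, a minimal hedged sketch of the pattern this commit introduces (not part of the commit itself; the subroutine wrapper and the real(c_double) type are illustrative stand-ins for the templated MATH_DATATYPE code in the diff below): the new compile-time parameter allComputeOnGPU enables the device path for non-MPI and CUDA-aware-MPI builds, and a device buffer handle is exposed to Fortran by converting it to a c_ptr with transfer and attaching a shaped Fortran pointer with c_f_pointer.

    ! Hedged sketch only; allComputeOnGPU, q_dev, q_mpi_dev, q_mpi_fortran_ptr follow the diff below,
    ! the subroutine name is hypothetical.
    subroutine sketch_device_pointer_setup(useGPU, q_dev, ldq, matrixCols)
      use, intrinsic :: iso_c_binding
      implicit none
      logical, intent(in)                  :: useGPU
      integer(kind=c_intptr_t), intent(in) :: q_dev          ! already allocated device buffer handle
      integer, intent(in)                  :: ldq, matrixCols
      type(c_ptr)                          :: q_mpi_dev
      real(kind=c_double), pointer         :: q_mpi_fortran_ptr(:,:)
    #ifndef WITH_MPI
      logical, parameter :: allComputeOnGPU = .true.
    #else
    #ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
      logical, parameter :: allComputeOnGPU = .true.
    #else
      logical, parameter :: allComputeOnGPU = .false.
    #endif
    #endif
      if (useGPU .and. allComputeOnGPU) then
        ! reinterpret the integer device handle as a C pointer ...
        q_mpi_dev = transfer(q_dev, q_mpi_dev)
        ! ... and attach a Fortran pointer with the array shape, so the device
        ! buffer can be passed directly to gpublas and (CUDA-aware) MPI calls
        call c_f_pointer(q_mpi_dev, q_mpi_fortran_ptr, [ldq, matrixCols])
      end if
    end subroutine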
@@ -105,150 +105,159 @@ subroutine trans_ev_tridi_to_band_&
#endif
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
logical, intent(in) :: useGPU
integer(kind=ik), intent(in) :: kernel
integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck), target :: q(ldq,*)
#else
MATH_DATATYPE(kind=rck), target :: q(ldq,matrixCols)
#endif
MATH_DATATYPE(kind=rck), intent(in) :: hh_trans(:,:)
MATH_DATATYPE(kind=rck), intent(in),target :: hh_trans(:,:)
integer(kind=c_intptr_t) :: hh_trans_dev
type(c_ptr) :: hh_trans_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: hh_trans_mpi_fortran_ptr(:,:)
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
integer(kind=MPI_KIND) :: np_rowsMPI, my_prowMPI, np_colsMPI, my_pcolMPI
integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2
integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end
integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end
integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length
integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count
#ifdef WITH_OPENMP_TRADITIONAL
integer(kind=ik) :: thread_width, thread_width2, csw, b_off, b_len
#endif
integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd
integer(kind=ik) :: a_off, current_tv_off, max_blk_size
integer(kind=ik) :: src, src_offset, dst, offset, nfact, num_blk
integer(kind=MPI_KIND) :: mpierr
logical :: flag
#ifdef WITH_OPENMP_TRADITIONAL
MATH_DATATYPE(kind=rck), pointer :: aIntern(:,:,:,:)
#else
MATH_DATATYPE(kind=rck), pointer :: aIntern(:,:,:)
#endif
MATH_DATATYPE(kind=rck) :: a_var
type(c_ptr) :: aIntern_ptr
MATH_DATATYPE(kind=rck), allocatable, target :: row(:)
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
integer(kind=c_intptr_t) :: row_dev
type(c_ptr) :: row_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: row_mpi_fortran_ptr(:)
#endif
MATH_DATATYPE(kind=rck), pointer :: row_group(:,:)
MATH_DATATYPE(kind=rck), allocatable, target :: top_border_send_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable, target :: top_border_recv_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable, target :: bottom_border_send_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable, target :: bottom_border_recv_buffer(:,:)
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
integer(kind=c_intptr_t) :: top_border_recv_buffer_dev, top_border_send_buffer_dev
type(c_ptr) :: top_border_recv_buffer_mpi_dev, top_border_send_buffer_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: top_border_recv_buffer_mpi_fortran_ptr(:,:), &
top_border_send_buffer_mpi_fortran_ptr(:,:)
integer(kind=c_intptr_t) :: bottom_border_send_buffer_dev, bottom_border_recv_buffer_dev
type(c_ptr) :: bottom_border_send_buffer_mpi_dev, bottom_border_recv_buffer_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: bottom_border_send_buffer_mpi_fortran_ptr(:,:), &
bottom_border_recv_buffer_mpi_fortran_ptr(:,:)
type(c_ptr) :: aIntern_mpi_dev
MATH_DATATYPE(kind=rck) :: a_var
type(c_ptr) :: aIntern_ptr
MATH_DATATYPE(kind=rck), allocatable, target :: row(:)
integer(kind=c_intptr_t) :: row_dev
type(c_ptr) :: row_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: row_mpi_fortran_ptr(:)
MATH_DATATYPE(kind=rck), pointer :: row_group(:,:)
MATH_DATATYPE(kind=rck), allocatable, target :: top_border_send_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable, target :: top_border_recv_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable, target :: bottom_border_send_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable, target :: bottom_border_recv_buffer(:,:)
integer(kind=c_intptr_t) :: top_border_recv_buffer_dev, top_border_send_buffer_dev
type(c_ptr) :: top_border_recv_buffer_mpi_dev, top_border_send_buffer_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: top_border_recv_buffer_mpi_fortran_ptr(:,:), &
top_border_send_buffer_mpi_fortran_ptr(:,:)
integer(kind=c_intptr_t) :: bottom_border_send_buffer_dev, bottom_border_recv_buffer_dev
type(c_ptr) :: bottom_border_send_buffer_mpi_dev, bottom_border_recv_buffer_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: bottom_border_send_buffer_mpi_fortran_ptr(:,:), &
bottom_border_recv_buffer_mpi_fortran_ptr(:,:)
type(c_ptr) :: aIntern_mpi_dev
#ifdef WITH_OPENMP_TRADITIONAL
MATH_DATATYPE(kind=rck), pointer :: aIntern_mpi_fortran_ptr(:,:,:,:)
MATH_DATATYPE(kind=rck), pointer :: aIntern_mpi_fortran_ptr(:,:,:,:)
#else
MATH_DATATYPE(kind=rck), pointer :: aIntern_mpi_fortran_ptr(:,:,:)
#endif
MATH_DATATYPE(kind=rck), pointer :: aIntern_mpi_fortran_ptr(:,:,:)
#endif
integer(kind=c_intptr_t) :: aIntern_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
type(c_ptr) :: bcast_buffer_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: bcast_buffer_mpi_fortran_ptr(:,:)
#endif
integer(kind=c_intptr_t) :: num
integer(kind=c_intptr_t) :: dev_offset, dev_offset_1
integer(kind=c_intptr_t) :: row_group_dev
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
type(c_ptr) :: row_group_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: row_group_mpi_fortran_ptr(:,:)
#endif
integer(kind=c_intptr_t) :: aIntern_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
integer(kind=c_intptr_t) :: q_dev
type(c_ptr) :: q_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: q_mpi_fortran_ptr(:,:)
#endif
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=ik) :: row_group_size, unpack_idx
type(c_ptr) :: bcast_buffer_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: bcast_buffer_mpi_fortran_ptr(:,:)
type(c_ptr) :: row_group_host, bcast_buffer_host
integer(kind=c_intptr_t) :: num
integer(kind=c_intptr_t) :: dev_offset, dev_offset_1
integer(kind=c_intptr_t) :: row_group_dev
integer(kind=ik) :: n_times
integer(kind=ik) :: chunk, this_chunk
type(c_ptr) :: row_group_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: row_group_mpi_fortran_ptr(:,:)
MATH_DATATYPE(kind=rck), allocatable,target :: result_buffer(:,:,:)
integer(kind=c_intptr_t) :: result_buffer_dev
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
type(c_ptr) :: result_buffer_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: result_buffer_mpi_fortran_ptr(:,:,:)
integer(kind=c_intptr_t) :: q_dev
type(c_ptr) :: q_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: q_mpi_fortran_ptr(:,:)
#endif
MATH_DATATYPE(kind=rck), pointer :: bcast_buffer(:,:)
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=ik) :: row_group_size, unpack_idx
type(c_ptr) :: row_group_host, bcast_buffer_host
integer(kind=ik) :: n_times
integer(kind=ik) :: chunk, this_chunk
MATH_DATATYPE(kind=rck), allocatable,target :: result_buffer(:,:,:)
integer(kind=c_intptr_t) :: result_buffer_dev
integer(kind=ik) :: n_off
type(c_ptr) :: result_buffer_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: result_buffer_mpi_fortran_ptr(:,:,:)
integer(kind=MPI_KIND), allocatable :: result_send_request(:), result_recv_request(:)
integer(kind=ik), allocatable :: limits(:)
integer(kind=MPI_KIND), allocatable :: top_send_request(:), bottom_send_request(:)
integer(kind=MPI_KIND), allocatable :: top_recv_request(:), bottom_recv_request(:)
MATH_DATATYPE(kind=rck), pointer :: bcast_buffer(:,:)
integer(kind=ik) :: n_off
integer(kind=MPI_KIND), allocatable :: result_send_request(:), result_recv_request(:)
integer(kind=ik), allocatable :: limits(:)
integer(kind=MPI_KIND), allocatable :: top_send_request(:), bottom_send_request(:)
integer(kind=MPI_KIND), allocatable :: top_recv_request(:), bottom_recv_request(:)
! MPI send/recv tags, arbitrary
integer(kind=ik), parameter :: bottom_recv_tag = 111
integer(kind=ik), parameter :: top_recv_tag = 222
integer(kind=ik), parameter :: result_recv_tag = 333
integer(kind=ik), intent(in) :: max_threads
#ifdef WITH_OPENMP_TRADITIONAL
integer(kind=ik) :: my_thread
#endif
! Just for measuring the kernel performance
real(kind=c_double) :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double
! long integer
integer(kind=lik) :: kernel_flops, kernel_flops_recv
logical, intent(in) :: wantDebug
logical :: success
integer(kind=ik) :: istat, print_flops
character(200) :: errorMessage
character(20) :: gpuString
logical :: successGPU
#ifndef WITH_MPI
integer(kind=ik) :: j1
#endif
integer(kind=ik) :: error
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
integer(kind=ik) :: ii,jj
#ifndef WITH_MPI
logical, parameter :: allComputeOnGPU = .true.
#else /* WITH_MPI */
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
logical, parameter :: allComputeOnGPU = .true.
#else
logical, parameter :: allComputeOnGPU = .false.
#endif
#endif /* WITH_MPI */
if(useGPU) then
gpuString = "_gpu"
else
@@ -571,23 +580,39 @@ subroutine trans_ev_tridi_to_band_&
if (useGPU) then
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
if (allComputeOnGPU) then
if (wantDebug) call obj%timer%start("cuda_memcpy")
successGPU = gpu_malloc(q_dev, ldq*matrixCols* size_of_datatype)
check_alloc_gpu("trans_ev_tridi_to_band: q_dev", successGPU)
successGPU = gpu_memcpy(q_dev, int(loc(q(1,1)),kind=c_intptr_t), &
ldq*matrixCols * size_of_datatype, &
gpuMemcpyHostToDevice)
check_memcpy_gpu("trans_ev_tridi_to_band 1: q -> q_dev", successGPU)
! associate with c_ptr
q_mpi_dev = transfer(q_dev, q_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(q_mpi_dev, q_mpi_fortran_ptr, &
[ldq,matrixCols])
if (wantDebug) call obj%timer%stop("cuda_memcpy")
#endif
print *,"hh_trans:",size(hh_trans,dim=1),size(hh_trans,dim=2)
successGPU = gpu_malloc(hh_trans_dev, size(hh_trans,dim=1)*size(hh_trans,dim=2)* size_of_datatype)
check_alloc_gpu("trans_ev_tridi_to_band: hh_trans_dev", successGPU)
! associate with c_ptr
hh_trans_mpi_dev = transfer(hh_trans_dev, hh_trans_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(hh_trans_mpi_dev, hh_trans_mpi_fortran_ptr, &
[size(hh_trans,dim=1),size(hh_trans,dim=2)])
successGPU = gpu_memcpy(c_loc(hh_trans_mpi_fortran_ptr(1,1)), &
c_loc(hh_trans(1,1)), &
size(hh_trans,dim=1)*size(hh_trans,dim=2) * size_of_datatype, &
gpuMemcpyHostToDevice)
check_memcpy_gpu("trans_ev_tridi_to_band: hh_trans -> hh_trans_dev", successGPU)
endif ! allComputeOnGPU
num = (stripe_width*a_dim2*stripe_count)* size_of_datatype
successGPU = gpu_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count* size_of_datatype)
@@ -596,18 +621,18 @@ subroutine trans_ev_tridi_to_band_&
successGPU = gpu_memset(aIntern_dev , 0, num)
check_memset_gpu("trans_ev_tridi_to_band: aIntern_dev", successGPU)
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
if (allComputeOnGPU) then
! associate with c_ptr
aIntern_mpi_dev = transfer(aIntern_dev, aIntern_mpi_dev)
! and associate a fortran pointer
#ifdef WITH_OPENMP_TRADITIONAL
call c_f_pointer(aIntern_mpi_dev, aIntern_mpi_fortran_ptr, &
[stripe_width,a_dim2,stripe_count,max_threads])
#else
call c_f_pointer(aIntern_mpi_dev, aIntern_mpi_fortran_ptr, &
[stripe_width,a_dim2,stripe_count])
#endif
#endif /* WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND */
endif ! allComputeOnGPU
! "row_group" and "row_group_dev" are needed for GPU optimizations
successGPU = gpu_malloc_host(row_group_host,l_nev*nblk*size_of_datatype)
@@ -622,13 +647,13 @@ subroutine trans_ev_tridi_to_band_&
successGPU = gpu_memset(row_group_dev , 0, num)
check_memset_gpu("trans_ev_tridi_to_band: row_group_dev", successGPU)
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
! associate with c_ptr
row_group_mpi_dev = transfer(row_group_dev, row_group_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(row_group_mpi_dev, row_group_mpi_fortran_ptr, &
[l_nev,nblk])
#endif
if (allComputeOnGPU) then
! associate with c_ptr
row_group_mpi_dev = transfer(row_group_dev, row_group_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(row_group_mpi_dev, row_group_mpi_fortran_ptr, &
[l_nev,nblk])
endif ! allComputeOnGPU
else ! GPUs are not used
@@ -671,8 +696,7 @@ subroutine trans_ev_tridi_to_band_&
row(:) = 0.0_rck
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
if (useGPU) then
if (useGPU .and. allComputeOnGPU) then
num = (l_nev)* size_of_datatype
successGPU = gpu_malloc(row_dev, num)
check_alloc_gpu("trans_ev_tridi_to_band: row_dev", successGPU)
@@ -686,8 +710,6 @@ subroutine trans_ev_tridi_to_band_&
call c_f_pointer(row_mpi_dev, row_mpi_fortran_ptr, &
[l_nev])
endif
#endif
! Copy q from a block cyclic distribution into a distribution with contiguous rows,
! and transpose the matrix using stripes of given stripe_width for cache blocking.
@@ -737,7 +759,7 @@ subroutine trans_ev_tridi_to_band_&
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, &
i - limits(ip), .false., wantDebug)
i - limits(ip), .false., wantDebug, allComputeOnGPU)
#ifdef WITH_MPI
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
@@ -755,7 +777,23 @@ subroutine trans_ev_tridi_to_band_&
#endif /* WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND */
#else /* WITH_MPI */
row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
if (allComputeOnGPU) then
! memcopy row_dev -> row_group_dev
successGPU = gpu_memcpy(c_loc(row_group_mpi_fortran_ptr(1,row_group_size)), &
c_loc(row_mpi_fortran_ptr(1)), &
l_nev* size_of_datatype, &
gpuMemcpyDeviceToDevice)
check_memcpy_gpu("trans_ev_tridi_to_band: row_dev -> row_group_dev", successGPU)
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("trans_ev_tridi_to_band: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
else ! allComputeOnGPU
row_group(1:l_nev, row_group_size) = row(1:l_nev)
endif ! allComputeOnGPU
#endif /* WITH_MPI */
else ! useGPU
@@ -764,11 +802,8 @@ subroutine trans_ev_tridi_to_band_&
call MPI_Recv(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
! row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
@@ -804,7 +839,7 @@ subroutine trans_ev_tridi_to_band_&
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, &
i - limits(ip), .false., wantDebug)
i - limits(ip), .false., wantDebug, allComputeOnGPU)
#ifdef WITH_MPI
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
if (wantDebug) call obj%timer%start("cuda_mpi_communication")
@@ -819,7 +854,23 @@ subroutine trans_ev_tridi_to_band_&
if (wantDebug) call obj%timer%stop("host_mpi_communication")
#endif /* WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND */
#else /* WITH_MPI */
row_group(1:l_nev, row_group_size) = row(1:l_nev)
if (allComputeOnGPU) then
! memcpy row_dev -> row_group_dev
successGPU = gpu_memcpy(c_loc(row_group_mpi_fortran_ptr(1,row_group_size)), &
c_loc(row_mpi_fortran_ptr(1)), &
l_nev* size_of_datatype, &
gpuMemcpyDeviceToDevice)
check_memcpy_gpu("trans_ev_tridi_to_band: row_dev -> row_group_dev", successGPU)
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("trans_ev_tridi_to_band: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
else
row_group(1:l_nev, row_group_size) = row(1:l_nev)
endif
#endif /* WITH_MPI */
else ! useGPU
@@ -828,11 +879,8 @@ subroutine trans_ev_tridi_to_band_&
call MPI_Recv(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
! row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
call unpack_row_&
@@ -860,16 +908,16 @@ subroutine trans_ev_tridi_to_band_&
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, &
i - limits(ip), .false., wantDebug)
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
if (wantDebug) call obj%timer%start("cuda_aware_gpublas")
call gpublas_PRECISION_COPY(l_nev, c_loc(q_mpi_fortran_ptr(src_offset,1)), ldq, &
c_loc(row_group_mpi_fortran_ptr(1,row_group_size)), 1)
if (wantDebug) call obj%timer%stop("cuda_aware_gpublas")
#else /* WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND */
row_group(:, row_group_size) = q(src_offset, 1:l_nev)
#endif /* WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND */
i - limits(ip), .false., wantDebug, allComputeOnGPU)
if (allComputeOnGPU) then
if (wantDebug) call obj%timer%start("cuda_aware_gpublas")
call gpublas_PRECISION_COPY(l_nev, c_loc(q_mpi_fortran_ptr(src_offset,1)), ldq, &
c_loc(row_group_mpi_fortran_ptr(1,row_group_size)), 1)
if (wantDebug) call obj%timer%stop("cuda_aware_gpublas")
else ! allComputeOnGPU
row_group(:, row_group_size) = q(src_offset, 1:l_nev)
endif ! allComputeOnGPU
else ! useGPU
row(:) = q(src_offset, 1:l_nev)
endif ! useGPU
@@ -918,24 +966,22 @@ subroutine trans_ev_tridi_to_band_&
! Send all rows which have not yet been sent
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
src_offset = 0
do dst = 0, ip-1
do i=limits(dst)+1,limits(dst+1)
if (mod((i-1)/nblk, np_rows) == my_prow) then
src_offset = src_offset+1
if (allComputeOnGPU .and. useGPU) then
src_offset = 0
do dst = 0, ip-1
do i=limits(dst)+1,limits(dst+1)
if (mod((i-1)/nblk, np_rows) == my_prow) then
src_offset = src_offset+1
if (wantDebug) call obj%timer%start("cuda_aware_gpublas")
call gpublas_PRECISION_COPY(l_nev, c_loc(q_mpi_fortran_ptr(src_offset,1)), ldq, &
c_loc(row_mpi_fortran_ptr(1)), 1)
if (wantDebug) call obj%timer%stop("cuda_aware_gpublas")
if (wantDebug) call obj%timer%start("cuda_aware_gpublas")
call gpublas_PRECISION_COPY(l_nev, c_loc(q_mpi_fortran_ptr(src_offset,1)), ldq, &
c_loc(row_mpi_fortran_ptr(1)), 1)
if (wantDebug) call obj%timer%stop("cuda_aware_gpublas")
!! this is needed other wise
! yes
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("trans_ev_tridi_to_band: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("trans_ev_tridi_to_band: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("cuda_mpi_communication")
@@ -944,16 +990,16 @@ subroutine trans_ev_tridi_to_band_&
if (wantDebug) call obj%timer%stop("cuda_mpi_communication")
#endif /* WITH_MPI */
endif
endif
enddo
enddo
enddo
#else /* WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND */
src_offset = 0
do dst = 0, ip-1
do i=limits(dst)+1,limits(dst+1)
if (mod((i-1)/nblk, np_rows) == my_prow) then
src_offset = src_offset+1
row(:) = q(src_offset, 1:l_nev)