Commit c79d2841 authored by Andreas Marek's avatar Andreas Marek
Browse files

New GPU code path in trans_ev_tridi also for OpenMP only

parent 899f8870
......@@ -55,7 +55,8 @@ subroutine pack_row_group_&
&_gpu_&
&PRECISION &
(obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
rows, n_offset, row_count, result_buffer_dev, nblk, num_result_buffers, nbuf, doCopyResult, wantDebug)
rows, n_offset, row_count, result_buffer_dev, nblk, num_result_buffers, nbuf, doCopyResult, &
wantDebug, allComputeOnGPU)
use gpu_c_kernel
use elpa_gpu
use elpa_abstract_impl
......@@ -75,10 +76,9 @@ subroutine pack_row_group_&
#endif
integer(kind=ik) :: max_idx
logical :: successGPU
logical, intent(in) :: doCopyResult, wantDebug
logical, intent(in) :: doCopyResult, wantDebug, allComputeOnGPU
integer(kind=ik), intent(in) :: nblk, nbuf
integer(kind=ik), intent(in) :: num_result_buffers
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
type(c_ptr) :: result_buffer_mpi_dev
#if REALCASE == 1
real(kind=C_DATATYPE_KIND), pointer :: result_buffer_mpi_fortran_ptr(:,:,:)
......@@ -86,16 +86,17 @@ subroutine pack_row_group_&
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND), pointer :: result_buffer_mpi_fortran_ptr(:,:,:)
#endif
#endif
if (wantDebug) call obj%timer%start("pack_row_group")
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
if (allComputeOnGPU) then
! associate with c_ptr
result_buffer_mpi_dev = transfer(result_buffer_dev, result_buffer_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(result_buffer_mpi_dev, result_buffer_mpi_fortran_ptr, &
[l_nev,nblk,num_result_buffers])
#endif
endif
! Use many blocks for higher GPU occupancy
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
......@@ -108,7 +109,7 @@ subroutine pack_row_group_&
&PRECISION &
(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev)
#ifndef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
if (.not.(allComputeOnGPU)) then
successGPU = gpu_memcpy(int(loc(rows(:, 1: row_count)),kind=c_intptr_t), row_group_dev , row_count * l_nev * size_of_&
&PRECISION&
&_&
......@@ -122,7 +123,7 @@ subroutine pack_row_group_&
&: error in cudaMemcpy"
stop 1
endif
#else
else ! allComputeOnGPU
if (doCopyResult) then
! need to copy row_group_dev -> result_buffer_dev
successGPU = gpu_memcpy(c_loc(result_buffer_mpi_fortran_ptr(1, 1, nbuf)), &
......@@ -141,7 +142,7 @@ subroutine pack_row_group_&
endif
endif
#endif
endif ! allComputeOnGPU
if (wantDebug) call obj%timer%stop("pack_row_group")
end subroutine
......@@ -153,7 +154,7 @@ subroutine unpack_row_group_&
&_gpu_&
&PRECISION &
(obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
a_dim2, l_nev, rows, n_offset, row_count, wantDebug)
a_dim2, l_nev, rows, n_offset, row_count, wantDebug, allComputeOnGPU)
use gpu_c_kernel
use elpa_abstract_impl
......@@ -174,14 +175,14 @@ subroutine unpack_row_group_&
integer(kind=ik) :: max_idx
logical :: successGPU
logical, intent(in) :: wantDebug
logical, intent(in) :: wantDebug, allComputeOnGPU
if (wantDebug) call obj%timer%start("unpack_row_group")
! Use many blocks for higher GPU occupancy
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
#ifndef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
successGPU = gpu_memcpy( row_group_dev , int(loc(rows(1, 1)),kind=c_intptr_t),row_count * l_nev * &
if (.not.(allComputeOnGPU)) then
successGPU = gpu_memcpy(row_group_dev , int(loc(rows(1, 1)),kind=c_intptr_t),row_count * l_nev * &
size_of_&
&PRECISION&
&_&
......@@ -195,7 +196,7 @@ subroutine unpack_row_group_&
&: error in cudaMemcpy"
stop 1
endif
#endif
endif ! allComputeOnGPU
! only read access to row_group_dev
call launch_my_unpack_gpu_kernel_&
......@@ -205,8 +206,7 @@ subroutine unpack_row_group_&
( row_count, n_offset, max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,a_dev)
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
if (allComputeOnGPU) then
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
if (.not.(successGPU)) then
......@@ -218,7 +218,7 @@ subroutine unpack_row_group_&
stop 1
endif
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
#endif
endif ! allComputeOnGPU
if (wantDebug) call obj%timer%stop("unpack_row_group")
end subroutine
......@@ -231,7 +231,7 @@ subroutine unpack_and_prepare_row_group_&
&PRECISION &
(obj, row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
unpack_idx, next_unpack_idx, force, wantDebug)
unpack_idx, next_unpack_idx, force, wantDebug, allComputeOnGPU)
use, intrinsic :: iso_c_binding
use precision
......@@ -253,7 +253,7 @@ subroutine unpack_and_prepare_row_group_&
integer(kind=ik), intent(in) :: nblk
integer(kind=ik), intent(inout) :: unpack_idx
integer(kind=ik), intent(in) :: next_unpack_idx
logical, intent(in) :: force, wantDebug
logical, intent(in) :: force, wantDebug, allComputeOnGPU
if (wantDebug) call obj%timer%start("unpack_and_prepare_row_group")
......@@ -269,7 +269,7 @@ subroutine unpack_and_prepare_row_group_&
&PRECISION&
(obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
a_dim2, l_nev, row_group(:, :), unpack_idx - row_group_size, row_group_size, &
wantDebug)
wantDebug, allComputeOnGPU)
row_group_size = 1
else
! Just prepare for the upcoming row
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment