Commit c79d2841 authored by Andreas Marek

New GPU code path in trans_ev_tridi also for OpenMP only

parent 899f8870
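This commit replaces the compile-time #ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND branches with a runtime branch on a new logical dummy argument, allComputeOnGPU, so the GPU-resident code path also becomes reachable in OpenMP-only builds. For readers new to the pattern, a minimal, self-contained sketch follows; the names are illustrative, not the actual ELPA interfaces:

! Sketch only: a compile-time preprocessor branch turned into a runtime
! branch, as done throughout this commit. Hypothetical names.
program runtime_gpu_path_sketch
  implicit none
  call pack_rows(allComputeOnGPU=.false.)   ! old default: stage through the host
  call pack_rows(allComputeOnGPU=.true.)    ! new path: data stays on the device
contains
  subroutine pack_rows(allComputeOnGPU)
    logical, intent(in) :: allComputeOnGPU
    if (.not. allComputeOnGPU) then
      ! formerly the #ifndef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND branch
      print *, "gpu_memcpy(..., gpuMemcpyDeviceToHost)"
    else
      ! formerly the #else (CUDA-aware MPI) branch
      print *, "gpu_memcpy(..., gpuMemcpyDeviceToDevice)"
    endif
  end subroutine pack_rows
end program runtime_gpu_path_sketch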
@@ -55,7 +55,8 @@ subroutine pack_row_group_&
&_gpu_&
&PRECISION &
(obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
-rows, n_offset, row_count, result_buffer_dev, nblk, num_result_buffers, nbuf, doCopyResult, wantDebug)
+rows, n_offset, row_count, result_buffer_dev, nblk, num_result_buffers, nbuf, doCopyResult, &
+wantDebug, allComputeOnGPU)
use gpu_c_kernel
use elpa_gpu
use elpa_abstract_impl
@@ -71,14 +72,13 @@ subroutine pack_row_group_&
real(kind=C_DATATYPE_KIND) :: rows(:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND) :: rows(:,:)
#endif
integer(kind=ik) :: max_idx
logical :: successGPU
-logical, intent(in) :: doCopyResult, wantDebug
+logical, intent(in) :: doCopyResult, wantDebug, allComputeOnGPU
integer(kind=ik), intent(in) :: nblk, nbuf
integer(kind=ik), intent(in) :: num_result_buffers
-#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
type(c_ptr) :: result_buffer_mpi_dev
#if REALCASE == 1
real(kind=C_DATATYPE_KIND), pointer :: result_buffer_mpi_fortran_ptr(:,:,:)
@@ -86,16 +86,17 @@ subroutine pack_row_group_&
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND), pointer :: result_buffer_mpi_fortran_ptr(:,:,:)
#endif
-#endif
if (wantDebug) call obj%timer%start("pack_row_group")
-#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
+if (allComputeOnGPU) then
! associate with c_ptr
result_buffer_mpi_dev = transfer(result_buffer_dev, result_buffer_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(result_buffer_mpi_dev, result_buffer_mpi_fortran_ptr, &
[l_nev,nblk,num_result_buffers])
-#endif
+endif
! Use many blocks for higher GPU occupancy
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
@@ -108,29 +109,12 @@ subroutine pack_row_group_&
&PRECISION &
(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev)
-#ifndef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
+if (.not.(allComputeOnGPU)) then
successGPU = gpu_memcpy(int(loc(rows(:, 1: row_count)),kind=c_intptr_t), row_group_dev , row_count * l_nev * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
& , gpuMemcpyDeviceToHost)
if (.not.(successGPU)) then
print *,"pack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION&
&: error in cudaMemcpy"
stop 1
endif
-#else
+else ! allComputeOnGPU
if (doCopyResult) then
! need to copy row_group_dev -> result_buffer_dev
successGPU = gpu_memcpy(c_loc(result_buffer_mpi_fortran_ptr(1, 1, nbuf)), &
row_group_dev , row_count * l_nev * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
& , gpuMemcpyDeviceToDevice)
if (.not.(successGPU)) then
print *,"pack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION&
&: error in cudaMemcpy"
stop 1
endif
endif
-#endif
+endif ! allComputeOnGPU
if (wantDebug) call obj%timer%stop("pack_row_group")
end subroutine
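A note on the transfer()/c_f_pointer() idiom used in pack_row_group above: the device buffer address travels as a plain integer(kind=c_intptr_t), is reinterpreted as a type(c_ptr) with transfer(), and then gets a shaped Fortran array view via c_f_pointer(). A minimal sketch of the same idiom, demonstrated on host memory since standard Fortran cannot dereference device memory; all names here are illustrative:

program c_f_pointer_sketch
  use, intrinsic :: iso_c_binding
  implicit none
  integer, parameter :: l_nev = 4, nblk = 2, num_result_buffers = 3
  real(c_double), target :: storage(l_nev*nblk*num_result_buffers)
  integer(c_intptr_t) :: buffer_addr
  type(c_ptr) :: buffer_cptr
  real(c_double), pointer :: view(:,:,:)
  storage = 1.0_c_double
  ! hold the address as a plain integer, like result_buffer_dev above
  buffer_addr = transfer(c_loc(storage), buffer_addr)
  ! reinterpret as c_ptr and attach a shaped Fortran pointer
  buffer_cptr = transfer(buffer_addr, buffer_cptr)
  call c_f_pointer(buffer_cptr, view, [l_nev, nblk, num_result_buffers])
  print *, shape(view), view(1, 1, 1)
end program c_f_pointer_sketch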
@@ -149,162 +150,161 @@ end subroutine
! Unpack a filled row group (i.e. an array of consecutive rows)
subroutine unpack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION &
(obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
-a_dim2, l_nev, rows, n_offset, row_count, wantDebug)
+a_dim2, l_nev, rows, n_offset, row_count, wantDebug, allComputeOnGPU)
use gpu_c_kernel
use elpa_abstract_impl
use precision
use, intrinsic :: iso_c_binding
use elpa_gpu
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=c_intptr_t) :: row_group_dev, a_dev
integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
integer(kind=ik), intent(in) :: n_offset, row_count
#if REALCASE == 1
real(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
#endif
integer(kind=ik) :: max_idx
logical :: successGPU
-logical, intent(in) :: wantDebug
+logical, intent(in) :: wantDebug, allComputeOnGPU
if (wantDebug) call obj%timer%start("unpack_row_group")
! Use many blocks for higher GPU occupancy
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
-#ifndef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
+if (.not.(allComputeOnGPU)) then
successGPU = gpu_memcpy(row_group_dev , int(loc(rows(1, 1)),kind=c_intptr_t),row_count * l_nev * &
size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&, gpuMemcpyHostToDevice)
if (.not.(successGPU)) then
print *,"unpack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION&
&: error in cudaMemcpy"
stop 1
endif
-#endif
+endif ! allComputeOnGPU
! only read access to row_group_dev
call launch_my_unpack_gpu_kernel_&
&MATH_DATATYPE&
&_&
&PRECISION &
( row_count, n_offset, max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,a_dev)
-#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
+if (allComputeOnGPU) then
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
if (.not.(successGPU)) then
print *,"unpack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION&
&: error in cudaDeviceSynchronize"
stop 1
endif
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
-#endif
+endif ! allComputeOnGPU
if (wantDebug) call obj%timer%stop("unpack_row_group")
end subroutine
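Why unpack_row_group synchronizes only in the allComputeOnGPU path: a device-to-host memcpy implicitly waits for preceding kernels, but when the unpacked data stays on the device for a later consumer (for example CUDA-aware MPI), the host must wait explicitly before handing the buffer on. A sketch of that reasoning, assuming the CUDA runtime is linked; ELPA's gpu_devicesynchronize wraps the equivalent call:

module device_sync_sketch
  use, intrinsic :: iso_c_binding
  implicit none
  interface
    ! CUDA runtime entry point; returns cudaSuccess (0) on success
    integer(c_int) function cudaDeviceSynchronize() bind(c, name="cudaDeviceSynchronize")
      import :: c_int
    end function
  end interface
contains
  subroutine sync_if_gpu_resident(allComputeOnGPU)
    logical, intent(in) :: allComputeOnGPU
    if (allComputeOnGPU) then
      ! the kernel's output is consumed on the device; wait for completion
      if (cudaDeviceSynchronize() /= 0_c_int) then
        print *, "error in cudaDeviceSynchronize"
        stop 1
      endif
    endif
    ! in the host-staged path the device-to-host memcpy already synchronizes
  end subroutine sync_if_gpu_resident
end module device_sync_sketch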
! This subroutine must be called before queuing the next row for unpacking; it ensures that an unpacking of the current row group
! occurs when the queue is full or when the next row belongs to another group
subroutine unpack_and_prepare_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION &
(obj, row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
-unpack_idx, next_unpack_idx, force, wantDebug)
+unpack_idx, next_unpack_idx, force, wantDebug, allComputeOnGPU)
use, intrinsic :: iso_c_binding
use precision
use gpu_c_kernel
use elpa_abstract_impl
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
#if REALCASE == 1
real(kind=C_DATATYPE_KIND) :: row_group(:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND) :: row_group(:,:)
#endif
integer(kind=c_intptr_t) :: row_group_dev, a_dev
integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
integer(kind=ik), intent(inout) :: row_group_size
integer(kind=ik), intent(in) :: nblk
integer(kind=ik), intent(inout) :: unpack_idx
integer(kind=ik), intent(in) :: next_unpack_idx
-logical, intent(in) :: force, wantDebug
+logical, intent(in) :: force, wantDebug, allComputeOnGPU
if (wantDebug) call obj%timer%start("unpack_and_prepare_row_group")
if (row_group_size == 0) then
! Nothing to flush, just prepare for the upcoming row
row_group_size = 1
else
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
! A flush and a reset must be performed
call unpack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION&
(obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
a_dim2, l_nev, row_group(:, :), unpack_idx - row_group_size, row_group_size, &
-wantDebug)
+wantDebug, allComputeOnGPU)
row_group_size = 1
else
! Just prepare for the upcoming row
row_group_size = row_group_size + 1
endif
endif
! Always update the index for the upcoming row
unpack_idx = next_unpack_idx
if (wantDebug) call obj%timer%stop("unpack_and_prepare_row_group")
end subroutine
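The flush conditions in unpack_and_prepare_row_group (group full, i.e. row_group_size == nblk, or a non-consecutive next row) can be exercised in isolation. A self-contained sketch with made-up row indices; the print statement stands in for the actual unpack_row_group call:

program row_group_queue_sketch
  implicit none
  integer, parameter :: nblk = 4
  integer :: row_group_size, unpack_idx, i
  integer :: idx(6)
  idx = [10, 11, 12, 13, 14, 20]   ! 20 is non-consecutive and forces a flush
  row_group_size = 0
  unpack_idx = 0
  do i = 1, size(idx)
    if (row_group_size == 0) then
      row_group_size = 1                    ! nothing queued yet
    else if (row_group_size == nblk .or. unpack_idx + 1 /= idx(i)) then
      print *, "flush rows", unpack_idx - row_group_size + 1, "to", unpack_idx
      row_group_size = 1                    ! reset after the flush
    else
      row_group_size = row_group_size + 1   ! queue the upcoming row
    endif
    unpack_idx = idx(i)                     ! always track the latest row
  end do
  print *, "final flush of", row_group_size, "rows"
end program row_group_queue_sketch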
! The host wrapper for extracting "tau" from the HH reflectors (see the kernel below)
subroutine extract_hh_tau_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION&
& (bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero)
use gpu_c_kernel
use precision
use, intrinsic :: iso_c_binding
implicit none
integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_tau_dev
integer(kind=ik), value :: nbw, n
logical, value :: is_zero
integer(kind=ik) :: val_is_zero
if (is_zero) then
val_is_zero = 1
else
val_is_zero = 0
endif
call launch_extract_hh_tau_gpu_kernel_&
&MATH_DATATYPE&
&_&
&PRECISION&
& (bcast_buffer_dev, hh_tau_dev, nbw, n, val_is_zero)
end subroutine
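The is_zero flag is lowered to an integer before the kernel launch because a Fortran logical has no guaranteed C representation. A minimal sketch of that conversion; using c_int here is an assumption for illustration, the code above uses ELPA's own integer kind ik:

program logical_to_c_int_sketch
  use, intrinsic :: iso_c_binding
  implicit none
  logical :: is_zero
  integer(c_int) :: val_is_zero
  is_zero = .true.
  ! lower the Fortran logical to a plain int for the bind(c) kernel launcher
  if (is_zero) then
    val_is_zero = 1_c_int
  else
    val_is_zero = 0_c_int
  endif
  print *, "val_is_zero =", val_is_zero
end program logical_to_c_int_sketch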