Commit c79d2841 authored by Andreas Marek's avatar Andreas Marek
Browse files

New GPU code path in trans_ev_tridi also for OpenMP only

parent 899f8870
...@@ -55,7 +55,8 @@ subroutine pack_row_group_& ...@@ -55,7 +55,8 @@ subroutine pack_row_group_&
&_gpu_& &_gpu_&
&PRECISION & &PRECISION &
(obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, & (obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
rows, n_offset, row_count, result_buffer_dev, nblk, num_result_buffers, nbuf, doCopyResult, wantDebug) rows, n_offset, row_count, result_buffer_dev, nblk, num_result_buffers, nbuf, doCopyResult, &
wantDebug, allComputeOnGPU)
use gpu_c_kernel use gpu_c_kernel
use elpa_gpu use elpa_gpu
use elpa_abstract_impl use elpa_abstract_impl
...@@ -71,14 +72,13 @@ subroutine pack_row_group_& ...@@ -71,14 +72,13 @@ subroutine pack_row_group_&
real(kind=C_DATATYPE_KIND) :: rows(:,:) real(kind=C_DATATYPE_KIND) :: rows(:,:)
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND) :: rows(:,:) complex(kind=C_DATATYPE_KIND) :: rows(:,:)
#endif #endif
integer(kind=ik) :: max_idx integer(kind=ik) :: max_idx
logical :: successGPU logical :: successGPU
logical, intent(in) :: doCopyResult, wantDebug logical, intent(in) :: doCopyResult, wantDebug, allComputeOnGPU
integer(kind=ik), intent(in) :: nblk, nbuf integer(kind=ik), intent(in) :: nblk, nbuf
integer(kind=ik), intent(in) :: num_result_buffers integer(kind=ik), intent(in) :: num_result_buffers
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
type(c_ptr) :: result_buffer_mpi_dev type(c_ptr) :: result_buffer_mpi_dev
#if REALCASE == 1 #if REALCASE == 1
real(kind=C_DATATYPE_KIND), pointer :: result_buffer_mpi_fortran_ptr(:,:,:) real(kind=C_DATATYPE_KIND), pointer :: result_buffer_mpi_fortran_ptr(:,:,:)
...@@ -86,16 +86,17 @@ subroutine pack_row_group_& ...@@ -86,16 +86,17 @@ subroutine pack_row_group_&
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND), pointer :: result_buffer_mpi_fortran_ptr(:,:,:) complex(kind=C_DATATYPE_KIND), pointer :: result_buffer_mpi_fortran_ptr(:,:,:)
#endif #endif
#endif
if (wantDebug) call obj%timer%start("pack_row_group") if (wantDebug) call obj%timer%start("pack_row_group")
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
! associate with c_ptr if (allComputeOnGPU) then
result_buffer_mpi_dev = transfer(result_buffer_dev, result_buffer_mpi_dev) ! associate with c_ptr
! and associate a fortran pointer result_buffer_mpi_dev = transfer(result_buffer_dev, result_buffer_mpi_dev)
call c_f_pointer(result_buffer_mpi_dev, result_buffer_mpi_fortran_ptr, & ! and associate a fortran pointer
[l_nev,nblk,num_result_buffers]) call c_f_pointer(result_buffer_mpi_dev, result_buffer_mpi_fortran_ptr, &
#endif [l_nev,nblk,num_result_buffers])
endif
! Use many blocks for higher GPU occupancy ! Use many blocks for higher GPU occupancy
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
...@@ -108,29 +109,12 @@ subroutine pack_row_group_& ...@@ -108,29 +109,12 @@ subroutine pack_row_group_&
&PRECISION & &PRECISION &
(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev) (row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev)
#ifndef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND if (.not.(allComputeOnGPU)) then
successGPU = gpu_memcpy(int(loc(rows(:, 1: row_count)),kind=c_intptr_t), row_group_dev , row_count * l_nev * size_of_& successGPU = gpu_memcpy(int(loc(rows(:, 1: row_count)),kind=c_intptr_t), row_group_dev , row_count * l_nev * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
& , gpuMemcpyDeviceToHost)
if (.not.(successGPU)) then
print *,"pack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION& &PRECISION&
&: error in cudaMemcpy" &_&
stop 1 &MATH_DATATYPE&
endif & , gpuMemcpyDeviceToHost)
#else
if (doCopyResult) then
! need to copy row_group_dev -> result_buffer_dev
successGPU = gpu_memcpy(c_loc(result_buffer_mpi_fortran_ptr(1, 1, nbuf)), &
row_group_dev , row_count * l_nev * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
& , gpuMemcpyDeviceToDevice)
if (.not.(successGPU)) then if (.not.(successGPU)) then
print *,"pack_row_group_& print *,"pack_row_group_&
&MATH_DATATYPE& &MATH_DATATYPE&
...@@ -139,9 +123,26 @@ subroutine pack_row_group_& ...@@ -139,9 +123,26 @@ subroutine pack_row_group_&
&: error in cudaMemcpy" &: error in cudaMemcpy"
stop 1 stop 1
endif endif
else ! allComputeOnGPU
if (doCopyResult) then
! need to copy row_group_dev -> result_buffer_dev
successGPU = gpu_memcpy(c_loc(result_buffer_mpi_fortran_ptr(1, 1, nbuf)), &
row_group_dev , row_count * l_nev * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
& , gpuMemcpyDeviceToDevice)
if (.not.(successGPU)) then
print *,"pack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION&
&: error in cudaMemcpy"
stop 1
endif
endif endif
#endif endif ! allComputeOnGPU
if (wantDebug) call obj%timer%stop("pack_row_group") if (wantDebug) call obj%timer%stop("pack_row_group")
end subroutine end subroutine
...@@ -149,162 +150,161 @@ end subroutine ...@@ -149,162 +150,161 @@ end subroutine
! Unpack a filled row group (i.e. an array of consecutive rows) ! Unpack a filled row group (i.e. an array of consecutive rows)
subroutine unpack_row_group_& subroutine unpack_row_group_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_gpu_& &_gpu_&
&PRECISION & &PRECISION &
(obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, & (obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
a_dim2, l_nev, rows, n_offset, row_count, wantDebug) a_dim2, l_nev, rows, n_offset, row_count, wantDebug, allComputeOnGPU)
use gpu_c_kernel use gpu_c_kernel
use elpa_abstract_impl use elpa_abstract_impl
use precision use precision
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use elpa_gpu use elpa_gpu
implicit none implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=c_intptr_t) :: row_group_dev, a_dev integer(kind=c_intptr_t) :: row_group_dev, a_dev
integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
integer(kind=ik), intent(in) :: n_offset, row_count integer(kind=ik), intent(in) :: n_offset, row_count
#if REALCASE == 1 #if REALCASE == 1
real(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :) real(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :) complex(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
#endif #endif
integer(kind=ik) :: max_idx integer(kind=ik) :: max_idx
logical :: successGPU logical :: successGPU
logical, intent(in) :: wantDebug logical, intent(in) :: wantDebug, allComputeOnGPU
if (wantDebug) call obj%timer%start("unpack_row_group") if (wantDebug) call obj%timer%start("unpack_row_group")
! Use many blocks for higher GPU occupancy ! Use many blocks for higher GPU occupancy
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
#ifndef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND if (.not.(allComputeOnGPU)) then
successGPU = gpu_memcpy( row_group_dev , int(loc(rows(1, 1)),kind=c_intptr_t),row_count * l_nev * & successGPU = gpu_memcpy(row_group_dev , int(loc(rows(1, 1)),kind=c_intptr_t),row_count * l_nev * &
size_of_& size_of_&
&PRECISION& &PRECISION&
&_& &_&
&MATH_DATATYPE& &MATH_DATATYPE&
&, gpuMemcpyHostToDevice) &, gpuMemcpyHostToDevice)
if (.not.(successGPU)) then if (.not.(successGPU)) then
print *,"unpack_row_group_& print *,"unpack_row_group_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_gpu_& &_gpu_&
&PRECISION& &PRECISION&
&: error in cudaMemcpy" &: error in cudaMemcpy"
stop 1 stop 1
endif endif
#endif endif ! allComputeOnGPU
! only read access to row_group_dev ! only read access to row_group_dev
call launch_my_unpack_gpu_kernel_& call launch_my_unpack_gpu_kernel_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION & &PRECISION &
( row_count, n_offset, max_idx,stripe_width,a_dim2, stripe_count, l_nev, & ( row_count, n_offset, max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,a_dev) row_group_dev,a_dev)
if (allComputeOnGPU) then
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize") successGPU = gpu_devicesynchronize()
successGPU = gpu_devicesynchronize() if (.not.(successGPU)) then
if (.not.(successGPU)) then print *,"unpack_row_group_&
print *,"unpack_row_group_& &MATH_DATATYPE&
&MATH_DATATYPE& &_gpu_&
&_gpu_& &PRECISION&
&PRECISION& &: error in cudaMemcpy"
&: error in cudaMemcpy" stop 1
stop 1 endif
endif if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize") endif ! allComputeOnGPU
#endif if (wantDebug) call obj%timer%stop("unpack_row_group")
if (wantDebug) call obj%timer%stop("unpack_row_group")
end subroutine end subroutine
! This subroutine must be called before queuing the next row for unpacking; it ensures that an unpacking of the current row group ! This subroutine must be called before queuing the next row for unpacking; it ensures that an unpacking of the current row group
! occurs when the queue is full or when the next row belongs to another group ! occurs when the queue is full or when the next row belongs to another group
subroutine unpack_and_prepare_row_group_& subroutine unpack_and_prepare_row_group_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_gpu_& &_gpu_&
&PRECISION & &PRECISION &
(obj, row_group, row_group_dev, a_dev, stripe_count, stripe_width, & (obj, row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, & last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
unpack_idx, next_unpack_idx, force, wantDebug) unpack_idx, next_unpack_idx, force, wantDebug, allComputeOnGPU)
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use precision use precision
use gpu_c_kernel use gpu_c_kernel
use elpa_abstract_impl use elpa_abstract_impl
implicit none implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj class(elpa_abstract_impl_t), intent(inout) :: obj
#if REALCASE == 1 #if REALCASE == 1
real(kind=C_DATATYPE_KIND) :: row_group(:,:) real(kind=C_DATATYPE_KIND) :: row_group(:,:)
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND) :: row_group(:,:) complex(kind=C_DATATYPE_KIND) :: row_group(:,:)
#endif #endif
integer(kind=c_intptr_t) :: row_group_dev, a_dev integer(kind=c_intptr_t) :: row_group_dev, a_dev
integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
integer(kind=ik), intent(inout) :: row_group_size integer(kind=ik), intent(inout) :: row_group_size
integer(kind=ik), intent(in) :: nblk integer(kind=ik), intent(in) :: nblk
integer(kind=ik), intent(inout) :: unpack_idx integer(kind=ik), intent(inout) :: unpack_idx
integer(kind=ik), intent(in) :: next_unpack_idx integer(kind=ik), intent(in) :: next_unpack_idx
logical, intent(in) :: force, wantDebug logical, intent(in) :: force, wantDebug, allComputeOnGPU
if (wantDebug) call obj%timer%start("unpack_and_prepare_row_group") if (wantDebug) call obj%timer%start("unpack_and_prepare_row_group")
if (row_group_size == 0) then if (row_group_size == 0) then
! Nothing to flush, just prepare for the upcoming row ! Nothing to flush, just prepare for the upcoming row
row_group_size = 1 row_group_size = 1
else else
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
! A flush and a reset must be performed ! A flush and a reset must be performed
call unpack_row_group_& call unpack_row_group_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_gpu_& &_gpu_&
&PRECISION& &PRECISION&
(obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, & (obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
a_dim2, l_nev, row_group(:, :), unpack_idx - row_group_size, row_group_size, & a_dim2, l_nev, row_group(:, :), unpack_idx - row_group_size, row_group_size, &
wantDebug) wantDebug, allComputeOnGPU)
row_group_size = 1 row_group_size = 1
else else
! Just prepare for the upcoming row ! Just prepare for the upcoming row
row_group_size = row_group_size + 1 row_group_size = row_group_size + 1
endif endif
endif endif
! Always update the index for the upcoming row ! Always update the index for the upcoming row
unpack_idx = next_unpack_idx unpack_idx = next_unpack_idx
if (wantDebug) call obj%timer%stop("unpack_and_prepare_row_group") if (wantDebug) call obj%timer%stop("unpack_and_prepare_row_group")
end subroutine end subroutine
! The host wrapper for extracting "tau" from the HH reflectors (see the kernel below) ! The host wrapper for extracting "tau" from the HH reflectors (see the kernel below)
subroutine extract_hh_tau_& subroutine extract_hh_tau_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_gpu_& &_gpu_&
&PRECISION& &PRECISION&
& (bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero) & (bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero)
use gpu_c_kernel use gpu_c_kernel
use precision use precision
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
implicit none implicit none
integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_tau_dev integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_tau_dev
integer(kind=ik), value :: nbw, n integer(kind=ik), value :: nbw, n
logical, value :: is_zero logical, value :: is_zero
integer(kind=ik) :: val_is_zero integer(kind=ik) :: val_is_zero
if (is_zero) then if (is_zero) then
val_is_zero = 1 val_is_zero = 1
else else
val_is_zero = 0 val_is_zero = 0
endif endif
call launch_extract_hh_tau_gpu_kernel_& call launch_extract_hh_tau_gpu_kernel_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION& &PRECISION&
& (bcast_buffer_dev, hh_tau_dev, nbw, n, val_is_zero) & (bcast_buffer_dev, hh_tau_dev, nbw, n, val_is_zero)
end subroutine end subroutine
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment