elpa / elpa

Commit c79d2841
authored Jul 12, 2021 by Andreas Marek

New GPU code path in trans_ev_tridi also for OpenMP only
parent 899f8870
Changes 2    Pipelines 1
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90
This diff is collapsed.
src/elpa2/pack_unpack_gpu.F90
@@ -55,7 +55,8 @@ subroutine pack_row_group_&
 &_gpu_&
 &PRECISION &
 (obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
-  rows, n_offset, row_count, result_buffer_dev, nblk, num_result_buffers, nbuf, doCopyResult, wantDebug)
+  rows, n_offset, row_count, result_buffer_dev, nblk, num_result_buffers, nbuf, doCopyResult, &
+  wantDebug, allComputeOnGPU)
   use gpu_c_kernel
   use elpa_gpu
   use elpa_abstract_impl
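The new allComputeOnGPU dummy argument moves the choice between the two copy paths from compile time (the WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND preprocessor guard) to run time, so the same build can either stage the packed rows through the host or keep them on the device. Below is a minimal, self-contained sketch of that pattern, not ELPA code: the module, the two copy subroutines and the plain host arrays are stand-ins chosen for illustration, with ordinary assignments in place of gpu_memcpy.

! Sketch: selecting a GPU copy path with a runtime flag instead of an #ifdef.
module copy_path_mod
  implicit none
contains

  subroutine copy_device_to_host(src, dst)
    ! stands in for gpu_memcpy(..., gpuMemcpyDeviceToHost)
    real, intent(in)  :: src(:)
    real, intent(out) :: dst(:)
    dst = src
  end subroutine

  subroutine copy_device_to_device(src, dst)
    ! stands in for gpu_memcpy(..., gpuMemcpyDeviceToDevice)
    real, intent(in)  :: src(:)
    real, intent(out) :: dst(:)
    dst = src
  end subroutine

  subroutine pack_rows(row_group_dev, rows_host, result_buffer_dev, allComputeOnGPU)
    real, intent(in)    :: row_group_dev(:)
    real, intent(inout) :: rows_host(:), result_buffer_dev(:)
    logical, intent(in) :: allComputeOnGPU
    if (.not. allComputeOnGPU) then
      ! the host keeps working with the packed rows
      call copy_device_to_host(row_group_dev, rows_host)
    else
      ! everything stays on the device
      call copy_device_to_device(row_group_dev, result_buffer_dev)
    endif
  end subroutine

end module

program demo
  use copy_path_mod
  implicit none
  real :: dev(4) = [1.0, 2.0, 3.0, 4.0]
  real :: host(4) = 0.0, buf(4) = 0.0
  call pack_rows(dev, host, buf, allComputeOnGPU=.false.)
  print *, 'host copy:    ', host
  call pack_rows(dev, host, buf, allComputeOnGPU=.true.)
  print *, 'device buffer:', buf
end program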
@@ -71,14 +72,13 @@ subroutine pack_row_group_&
   real(kind=C_DATATYPE_KIND)    :: rows(:,:)
 #endif
 #if COMPLEXCASE == 1
   complex(kind=C_DATATYPE_KIND) :: rows(:,:)
 #endif
   integer(kind=ik)              :: max_idx
   logical                       :: successGPU
-  logical, intent(in)           :: doCopyResult, wantDebug
+  logical, intent(in)           :: doCopyResult, wantDebug, allComputeOnGPU
   integer(kind=ik), intent(in)  :: nblk, nbuf
   integer(kind=ik), intent(in)  :: num_result_buffers
-#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
   type(c_ptr)                   :: result_buffer_mpi_dev
 #if REALCASE == 1
   real(kind=C_DATATYPE_KIND), pointer :: result_buffer_mpi_fortran_ptr(:,:,:)
@@ -86,16 +86,17 @@ subroutine pack_row_group_&
#if COMPLEXCASE == 1
complex
(
kind
=
C_DATATYPE_KIND
),
pointer
::
result_buffer_mpi_fortran_ptr
(:,:,:)
#endif
#endif
if
(
wantDebug
)
call
obj
%
timer
%
start
(
"pack_row_group"
)
#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
! associate with c_ptr
result_buffer_mpi_dev
=
transfer
(
result_buffer_dev
,
result_buffer_mpi_dev
)
! and associate a fortran pointer
call
c_f_pointer
(
result_buffer_mpi_dev
,
result_buffer_mpi_fortran_ptr
,
&
[
l_nev
,
nblk
,
num_result_buffers
])
#endif
if
(
allComputeOnGPU
)
then
! associate with c_ptr
result_buffer_mpi_dev
=
transfer
(
result_buffer_dev
,
result_buffer_mpi_dev
)
! and associate a fortran pointer
call
c_f_pointer
(
result_buffer_mpi_dev
,
result_buffer_mpi_fortran_ptr
,
&
[
l_nev
,
nblk
,
num_result_buffers
])
endif
! Use many blocks for higher GPU occupancy
max_idx
=
(
stripe_count
-
1
)
*
stripe_width
+
last_stripe_width
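The transfer/c_f_pointer pair in the block above takes the device address, which ELPA carries around as an integer(kind=c_intptr_t), re-interprets it as a type(c_ptr), and then attaches a shaped Fortran pointer so result_buffer_dev can be indexed like a normal (l_nev, nblk, num_result_buffers) array. The sketch below shows the same idiom in a form that compiles and runs without a GPU; an ordinary host array stands in for device memory, and the names only mirror the diff, they are not ELPA code.

! Sketch: integer address -> c_ptr -> shaped Fortran pointer (host memory stands in for GPU memory).
program cptr_view_demo
  use, intrinsic :: iso_c_binding
  implicit none
  integer, parameter                  :: l_nev = 3, nblk = 2
  real(c_double), allocatable, target :: buffer(:)
  integer(kind=c_intptr_t)            :: buffer_dev   ! address stored as an integer, like the *_dev variables
  type(c_ptr)                         :: buffer_cptr
  real(c_double), pointer             :: view(:,:)    ! Fortran view of the same memory
  integer                             :: i

  allocate(buffer(l_nev*nblk))
  buffer = [(real(i, c_double), i = 1, l_nev*nblk)]

  ! store the address as a plain integer, as the *_dev variables do
  buffer_dev = transfer(c_loc(buffer), buffer_dev)

  ! re-associate: integer address -> c_ptr, then give it an (l_nev, nblk) shape
  buffer_cptr = transfer(buffer_dev, buffer_cptr)
  call c_f_pointer(buffer_cptr, view, [l_nev, nblk])

  print *, 'column 2 of the view:', view(:, 2)   ! prints 4.0 5.0 6.0
end program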
@@ -108,29 +109,12 @@ subroutine pack_row_group_&
 &PRECISION&
 & (row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev)

-#ifndef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
-  successGPU = gpu_memcpy(int(loc(rows(:, 1:row_count)), kind=c_intptr_t), row_group_dev, row_count * l_nev * size_of_&
-              &PRECISION&
-              &_&
-              &MATH_DATATYPE&
-              &, gpuMemcpyDeviceToHost)
-  if (.not.(successGPU)) then
-    print *, "pack_row_group_&
-             &MATH_DATATYPE&
-             &_gpu_&
-             &PRECISION&
-             &: error in cudaMemcpy"
-    stop 1
-  endif
-#else
-  if (doCopyResult) then
-    ! need to copy row_group_dev -> result_buffer_dev
-    successGPU = gpu_memcpy(c_loc(result_buffer_mpi_fortran_ptr(1, 1, nbuf)), &
-                 row_group_dev, row_count * l_nev * size_of_&
-                 &PRECISION&
-                 &_&
-                 &MATH_DATATYPE&
-                 &, gpuMemcpyDeviceToDevice)
+  if (.not.(allComputeOnGPU)) then
+    successGPU = gpu_memcpy(int(loc(rows(:, 1:row_count)), kind=c_intptr_t), row_group_dev, row_count * l_nev * size_of_&
+                &PRECISION&
+                &_&
+                &MATH_DATATYPE&
+                &, gpuMemcpyDeviceToHost)
     if (.not.(successGPU)) then
       print *, "pack_row_group_&
                &MATH_DATATYPE&
@@ -139,9 +123,26 @@ subroutine pack_row_group_&
               &: error in cudaMemcpy"
       stop 1
     endif
-  endif
-#endif
+  else ! allComputeOnGPU
+    if (doCopyResult) then
+      ! need to copy row_group_dev -> result_buffer_dev
+      successGPU = gpu_memcpy(c_loc(result_buffer_mpi_fortran_ptr(1, 1, nbuf)), &
+                   row_group_dev, row_count * l_nev * size_of_&
+                   &PRECISION&
+                   &_&
+                   &MATH_DATATYPE&
+                   &, gpuMemcpyDeviceToDevice)
+      if (.not.(successGPU)) then
+        print *, "pack_row_group_&
+                 &MATH_DATATYPE&
+                 &_gpu_&
+                 &PRECISION&
+                 &: error in cudaMemcpy"
+        stop 1
+      endif
+    endif
+  endif ! allComputeOnGPU

   if (wantDebug) call obj%timer%stop("pack_row_group")
 end subroutine
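In the gpu_memcpy calls above the transfer size is row_count * l_nev elements times the size of one matrix element, spelled with ELPA's size_of_<PRECISION>_<MATH_DATATYPE> constants. The snippet below only illustrates that byte-count arithmetic with the standard storage_size intrinsic; the constant names and their values belong to ELPA and are not reproduced here.

! Sketch: byte count for copying a row group of double precision reals.
program copy_size_demo
  use, intrinsic :: iso_c_binding
  implicit none
  integer, parameter       :: row_count = 4, l_nev = 100
  real(c_double)           :: element
  integer(kind=c_intptr_t) :: nbytes

  ! storage_size() returns bits per element; divide by 8 to get bytes
  nbytes = int(row_count, c_intptr_t) * int(l_nev, c_intptr_t) * (storage_size(element) / 8)
  print *, 'bytes to copy:', nbytes   ! 4 * 100 * 8 = 3200
end program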
@@ -149,162 +150,161 @@ end subroutine

 ! Unpack a filled row group (i.e. an array of consecutive rows)
 subroutine unpack_row_group_&
 &MATH_DATATYPE&
 &_gpu_&
 &PRECISION &
 (obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
-  a_dim2, l_nev, rows, n_offset, row_count, wantDebug)
+  a_dim2, l_nev, rows, n_offset, row_count, wantDebug, allComputeOnGPU)
   use gpu_c_kernel
   use elpa_abstract_impl
   use precision
   use, intrinsic :: iso_c_binding
   use elpa_gpu
   implicit none
   class(elpa_abstract_impl_t), intent(inout) :: obj
   integer(kind=c_intptr_t)                   :: row_group_dev, a_dev
   integer(kind=ik), intent(in)               :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
   integer(kind=ik), intent(in)               :: n_offset, row_count
 #if REALCASE == 1
   real(kind=C_DATATYPE_KIND), intent(in)     :: rows(:, :)
 #endif
 #if COMPLEXCASE == 1
   complex(kind=C_DATATYPE_KIND), intent(in)  :: rows(:, :)
 #endif
   integer(kind=ik)                           :: max_idx
   logical                                    :: successGPU
-  logical, intent(in)                        :: wantDebug
+  logical, intent(in)                        :: wantDebug, allComputeOnGPU

   if (wantDebug) call obj%timer%start("unpack_row_group")
   ! Use many blocks for higher GPU occupancy
   max_idx = (stripe_count - 1) * stripe_width + last_stripe_width

-#ifndef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
-  successGPU = gpu_memcpy(row_group_dev, int(loc(rows(1, 1)), kind=c_intptr_t), row_count * l_nev * &
-               size_of_&
-               &PRECISION&
-               &_&
-               &MATH_DATATYPE&
-               &, gpuMemcpyHostToDevice)
-  if (.not.(successGPU)) then
-    print *, "unpack_row_group_&
-             &MATH_DATATYPE&
-             &_gpu_&
-             &PRECISION&
-             &: error in cudaMemcpy"
-    stop 1
-  endif
-#endif
+  if (.not.(allComputeOnGPU)) then
+    successGPU = gpu_memcpy(row_group_dev, int(loc(rows(1, 1)), kind=c_intptr_t), row_count * l_nev * &
+                 size_of_&
+                 &PRECISION&
+                 &_&
+                 &MATH_DATATYPE&
+                 &, gpuMemcpyHostToDevice)
+    if (.not.(successGPU)) then
+      print *, "unpack_row_group_&
+               &MATH_DATATYPE&
+               &_gpu_&
+               &PRECISION&
+               &: error in cudaMemcpy"
+      stop 1
+    endif
+  endif ! allComputeOnGPU

   ! only read access to row_group_dev
   call launch_my_unpack_gpu_kernel_&
   &MATH_DATATYPE&
   &_&
   &PRECISION&
   (row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, &
    row_group_dev, a_dev)

-#ifdef WITH_CUDA_AWARE_MPI_TRANS_TRIDI_TO_BAND
-  if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
-  successGPU = gpu_devicesynchronize()
-  if (.not.(successGPU)) then
-    print *, "unpack_row_group_&
-             &MATH_DATATYPE&
-             &_gpu_&
-             &PRECISION&
-             &: error in cudaMemcpy"
-    stop 1
-  endif
-  if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
-#endif
+  if (allComputeOnGPU) then
+    if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
+    successGPU = gpu_devicesynchronize()
+    if (.not.(successGPU)) then
+      print *, "unpack_row_group_&
+               &MATH_DATATYPE&
+               &_gpu_&
+               &PRECISION&
+               &: error in cudaMemcpy"
+      stop 1
+    endif
+    if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
+  endif ! allComputeOnGPU

   if (wantDebug) call obj%timer%stop("unpack_row_group")
 end subroutine

 ! This subroutine must be called before queuing the next row for unpacking; it ensures that an unpacking of the current row group
 ! occurs when the queue is full or when the next row belongs to another group
 subroutine unpack_and_prepare_row_group_&
 &MATH_DATATYPE&
 &_gpu_&
 &PRECISION &
 (obj, row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
  last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
-  unpack_idx, next_unpack_idx, force, wantDebug)
+  unpack_idx, next_unpack_idx, force, wantDebug, allComputeOnGPU)
   use, intrinsic :: iso_c_binding
   use precision
   use gpu_c_kernel
   use elpa_abstract_impl
   implicit none
   class(elpa_abstract_impl_t), intent(inout) :: obj
 #if REALCASE == 1
   real(kind=C_DATATYPE_KIND)                 :: row_group(:,:)
 #endif
 #if COMPLEXCASE == 1
   complex(kind=C_DATATYPE_KIND)              :: row_group(:,:)
 #endif
   integer(kind=c_intptr_t)                   :: row_group_dev, a_dev
   integer(kind=ik), intent(in)               :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
   integer(kind=ik), intent(inout)            :: row_group_size
   integer(kind=ik), intent(in)               :: nblk
   integer(kind=ik), intent(inout)            :: unpack_idx
   integer(kind=ik), intent(in)               :: next_unpack_idx
-  logical, intent(in)                        :: force, wantDebug
+  logical, intent(in)                        :: force, wantDebug, allComputeOnGPU

   if (wantDebug) call obj%timer%start("unpack_and_prepare_row_group")

   if (row_group_size == 0) then
     ! Nothing to flush, just prepare for the upcoming row
     row_group_size = 1
   else
     if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
       ! A flush and a reset must be performed
       call unpack_row_group_&
       &MATH_DATATYPE&
       &_gpu_&
       &PRECISION&
       (obj, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
        a_dim2, l_nev, row_group(:, :), unpack_idx - row_group_size, row_group_size, &
-        wantDebug)
+        wantDebug, allComputeOnGPU)
       row_group_size = 1
     else
       ! Just prepare for the upcoming row
       row_group_size = row_group_size + 1
     endif
   endif
   ! Always update the index for the upcoming row
   unpack_idx = next_unpack_idx

   if (wantDebug) call obj%timer%stop("unpack_and_prepare_row_group")
 end subroutine

 ! The host wrapper for extracting "tau" from the HH reflectors (see the kernel below)
 subroutine extract_hh_tau_&
 &MATH_DATATYPE&
 &_gpu_&
 &PRECISION&
 &(bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero)
   use gpu_c_kernel
   use precision
   use, intrinsic :: iso_c_binding
   implicit none
   integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_tau_dev
   integer(kind=ik), value  :: nbw, n
   logical, value           :: is_zero
   integer(kind=ik)         :: val_is_zero

   if (is_zero) then
     val_is_zero = 1
   else
     val_is_zero = 0
   endif

   call launch_extract_hh_tau_gpu_kernel_&
   &MATH_DATATYPE&
   &_&
   &PRECISION&
   &(bcast_buffer_dev, hh_tau_dev, nbw, n, val_is_zero)
 end subroutine
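The flush condition in unpack_and_prepare_row_group above fires when a flush is forced, when the group has grown to nblk rows, or when the next row index is no longer consecutive. The following self-contained sketch replays that logic on a list of row indices, with a print standing in for the GPU unpack; it is a schematic illustration, not ELPA code, and the sample indices are invented for the demo.

! Sketch: when a row group is flushed (forced, full, or non-consecutive next row).
program row_group_flush_demo
  implicit none
  integer, parameter :: nblk = 4              ! maximum rows per group
  integer            :: row_indices(7)
  integer            :: row_group_size, unpack_idx, next_unpack_idx, i
  logical            :: force

  row_indices    = [1, 2, 3, 4, 5, 9, 10]     ! note the jump from 5 to 9
  row_group_size = 0
  unpack_idx     = 0
  force          = .false.

  do i = 1, size(row_indices)
    next_unpack_idx = row_indices(i)
    if (row_group_size == 0) then
      ! nothing to flush, just prepare for the upcoming row
      row_group_size = 1
    else if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
      ! a flush and a reset must be performed
      print *, 'flush rows', unpack_idx - row_group_size + 1, 'to', unpack_idx
      row_group_size = 1
    else
      ! just prepare for the upcoming row
      row_group_size = row_group_size + 1
    endif
    ! always update the index for the upcoming row
    unpack_idx = next_unpack_idx
  end do

  print *, 'final flush of rows', unpack_idx - row_group_size + 1, 'to', unpack_idx
end program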