elpa · Commit 1151f6a5

Authored Jan 22, 2016 by Andreas Marek

    Move GPU part "contains" functions in separate module

Parent: a0934d4e
Changes: 3 files (inline diff)
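The change follows the usual pattern for pulling procedures out of a Fortran "contains" block: everything the helpers previously reached through host association (the device pointers, the stripe geometry, the row-group bookkeeping) becomes an explicit dummy argument of a module procedure, which is why every GPU call site in elpa2_compute.F90 below gains a longer argument list. A minimal sketch of that pattern, using hypothetical names that are not part of the ELPA sources:

    ! Before: helper inside the parent subroutine's "contains" block,
    ! reading buf and n through host association.
    !
    !   subroutine flush_group()
    !     buf(1:n) = 0.0
    !   end subroutine
    !
    ! After: helper in a separate module; the former host variables
    ! are passed explicitly by every caller.
    module group_helpers
      implicit none
      public :: flush_group
    contains
      subroutine flush_group(buf, n)
        integer, intent(in)    :: n
        real,    intent(inout) :: buf(:)
        buf(1:n) = 0.0
      end subroutine flush_group
    end module group_helpers

In the real module the formerly host-associated variables are the device pointers (a_dev, row_group_dev, bcast_buffer_dev, ...), the stripe geometry and the row-group counters, as the new argument lists below show.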
Makefile.am

@@ -18,6 +18,7 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
   src/mod_cuda.F90 \
   src/interface_c_kernel.F90 \
   src/mod_pack_unpack_real.F90 \
+  src/mod_pack_unpack_real_gpu.F90 \
   src/elpa2_kernels/mod_single_hh_trafo_real.F90 \
   src/mod_compute_hh_trafo_real.F90 \
   src/mod_compute_hh_trafo_complex.F90 \
src/elpa2_compute.F90

@@ -2475,6 +2475,7 @@ module ELPA2_compute
       use cuda_functions
       use precision
       use pack_unpack_real
+      use pack_unpack_real_gpu
       use compute_hh_trafo_real
       implicit none
       logical, intent(in) :: useGPU
@@ -2796,7 +2797,9 @@ module ELPA2_compute
           if (useGPU) then
             ! An unpacking of the current row group may occur before queuing the next row
-            call unpack_and_prepare_row_group_real_gpu(i - limits(ip), .false.)
+            call unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, &
+                                                       stripe_width, last_stripe_width, a_dim2, l_nev,&
+                                                       row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
             call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
           else
             call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
@@ -2832,7 +2835,9 @@ module ELPA2_compute
 #else /* WITH_OPENMP */
           if (useGPU) then
             ! An unpacking of the current row group may occur before queuing the next row
-            call unpack_and_prepare_row_group_real_gpu(i - limits(ip), .false.)
+            call unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, &
+                                                       stripe_width, last_stripe_width, a_dim2, l_nev,&
+                                                       row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
             row_group(:, row_group_size) = q(src_offset, 1:l_nev)
           else
             call unpack_row_real_cpu(a, row, i - limits(ip), stripe_count, stripe_width, last_stripe_width)
@@ -2889,7 +2894,9 @@ module ELPA2_compute
 #else /* WITH_OPENMP */
         if (useGPU) then
           ! An unpacking of the current row group may occur before queuing the next row
-          call unpack_and_prepare_row_group_real_gpu(i - limits(my_prow), .false.)
+          call unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
+                                                     last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
+                                                     unpack_idx, i - limits(my_prow), .false.)
           call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
         else
           call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
@@ -2905,7 +2912,8 @@ module ELPA2_compute
       if (useGPU) then
         ! Force an unpacking of all remaining rows that haven't been unpacked yet
-        call unpack_and_prepare_row_group_real_gpu(-1, .true.)
+        call unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
+                                                   a_dim2, l_nev, row_group_size, nblk, unpack_idx, -1, .true.)
         successCUDA = cuda_devicesynchronize()
         if (.not.(successCUDA)) then
@@ -3160,8 +3168,8 @@ module ELPA2_compute
           stop
         endif
-        call extract_hh_tau_real_gpu(nbw, current_local_n, .false.)
-        call compute_hh_dot_products_real_gpu(nbw, current_local_n)
+        call extract_hh_tau_real_gpu(bcast_buffer_dev, hh_tau_dev, nbw, current_local_n, .false.)
+        call compute_hh_dot_products_real_gpu(bcast_buffer_dev, hh_dot_dev, nbw, current_local_n)
       endif
     else
@@ -3175,7 +3183,7 @@ module ELPA2_compute
           stop
         endif
-        call extract_hh_tau_real_gpu(nbw, 1, .true.)
+        call extract_hh_tau_real_gpu(bcast_buffer_dev, hh_tau_dev, nbw, 1, .true.)
       endif
     endif
@@ -3644,7 +3652,8 @@ module ELPA2_compute
         if (dst == 0) then
           if (useGPU) then
             row_group_size = min(na - num_blk * nblk, nblk)
-            call pack_row_group_real_gpu(row_group(:, :), j * nblk + a_off, row_group_size)
+            call pack_row_group_real_gpu(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
+                                         row_group(:, :), j * nblk + a_off, row_group_size)
             do i = 1, row_group_size
               q((num_blk / np_rows) * nblk + i, 1:l_nev) = row_group(:, i)
@@ -3652,13 +3661,14 @@ module ELPA2_compute
           else
             do i = 1, min(na - num_blk * nblk, nblk)
-              call pack_row_real_cpu(row, j * nblk + i + a_off)
+              call pack_row_real_cpu(a, row, j * nblk + i + a_off, stripe_width, last_stripe_width, stripe_count)
               q((num_blk / np_rows) * nblk + i, 1:l_nev) = row(:)
             enddo
           endif
         else
           if (useGPU) then
-            call pack_row_group_real_gpu(result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
+            call pack_row_group_real_gpu(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
+                                         result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
           else
             do i = 1, nblk
               call pack_row_real_cpu(a, result_buffer(:, i, nbuf), j * nblk + i + a_off, stripe_width, last_stripe_width, stripe_count)
@@ -3962,168 +3972,7 @@ module ELPA2_compute
 #ifdef HAVE_DETAILED_TIMINGS
       call timer%stop("trans_ev_tridi_to_band_real")
 #endif
       return
-    contains
-      ! Pack a filled row group (i.e. an array of consecutive rows)
-      subroutine pack_row_group_real_gpu(rows, n_offset, row_count)
-        use cuda_c_kernel
-        use precision
-        implicit none
-        integer(kind=ik), intent(in) :: n_offset, row_count
-        real(kind=rk)                :: rows(:,:)
-        integer(kind=ik)             :: max_idx
-        logical                      :: successCUDA
-
-        ! Use many blocks for higher GPU occupancy
-        max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
-
-        ! Use one kernel call to pack the entire row group
-        ! call my_pack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, a_dev, row_group_dev)
-        call launch_my_pack_c_kernel_real(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, &
-                                          l_nev, a_dev, row_group_dev)
-
-        ! Issue one single transfer call for all rows (device to host)
-        ! rows(:, 1 : row_count) = row_group_dev(:, 1 : row_count)
-        successCUDA = cuda_memcpy(loc(rows(:, 1:row_count)), row_group_dev, row_count * l_nev * size_of_real_datatype, &
-                                  cudaMemcpyDeviceToHost)
-        if (.not.(successCUDA)) then
-          print *, "pack_row_group_real_gpu: error in cudaMemcpy"
-          stop
-        endif
-        !write(*,*) cudaGetErrorString(istat)
-      end subroutine
-
-      ! Unpack a filled row group (i.e. an array of consecutive rows)
-      subroutine unpack_row_group_real_gpu(rows, n_offset, row_count)
-        use cuda_c_kernel
-        use precision
-        implicit none
-        integer(kind=ik), intent(in) :: n_offset, row_count
-        real(kind=rk), intent(in)    :: rows(:, :)
-        integer(kind=ik)             :: max_idx
-        integer(kind=ik)             :: i
-        logical                      :: successCUDA
-
-        ! Use many blocks for higher GPU occupancy
-        max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
-
-        ! Issue one single transfer call for all rows (host to device)
-        ! row_group_dev(:, 1 : row_count) = rows(:, 1 : row_count)
-        !istat = cuda_memcpy( row_group_dev , loc(rows(:, 1: row_count)),row_count * l_nev * size_of_real_datatype , &
-        !        cudaMemcpyHostToDevice)
-        successCUDA = cuda_memcpy(row_group_dev, loc(rows(1, 1)), row_count * l_nev * &
-                                  size_of_real_datatype, cudaMemcpyHostToDevice)
-        if (.not.(successCUDA)) then
-          print *, "unpack_row_group_real_gpu: error in cudaMemcpy"
-          stop
-        endif
-        !write(*,*) cudaGetErrorString(istat)
-
-        ! Use one kernel call to pack the entire row group
-        ! call my_unpack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, row_group_dev, a_dev)
-        call launch_my_unpack_c_kernel_real(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, &
-                                            row_group_dev, a_dev)
-      end subroutine
-
-      ! This subroutine must be called before queuing the next row for unpacking; it ensures that an unpacking of the current row group
-      ! occurs when the queue is full or when the next row belongs to another group
-      subroutine unpack_and_prepare_row_group_real_gpu(next_unpack_idx, force)
-        use precision
-        implicit none
-        integer(kind=ik), intent(in) :: next_unpack_idx
-        logical, intent(in)          :: force
-
-        if (row_group_size == 0) then
-          ! Nothing to flush, just prepare for the upcoming row
-          row_group_size = 1
-        else
-          if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
-            ! A flush and a reset must be performed
-            call unpack_row_group_real_gpu(row_group(:, :), unpack_idx - row_group_size, row_group_size)
-            row_group_size = 1
-          else
-            ! Just prepare for the upcoming row
-            row_group_size = row_group_size + 1
-          endif
-        endif
-        ! Always update the index for the upcoming row
-        unpack_idx = next_unpack_idx
-      end subroutine
-
-      ! The host wrapper for computing the dot products between consecutive HH reflectors (see the kernel below)
-      subroutine compute_hh_dot_products_real_gpu(nbw, n)
-        use cuda_c_kernel
-        use precision
-        implicit none
-        integer(kind=ik), value :: nbw, n
-
-        if (n .le. 1) return
-        call launch_compute_hh_dotp_c_kernel_real(bcast_buffer_dev, hh_dot_dev, nbw, n)
-      end subroutine
-
-      ! The host wrapper for extracting "tau" from the HH reflectors (see the kernel below)
-      subroutine extract_hh_tau_real_gpu(nbw, n, is_zero)
-        use cuda_c_kernel
-        use precision
-        implicit none
-        integer(kind=ik), value :: nbw, n
-        logical, value          :: is_zero
-        integer(kind=ik)        :: val_is_zero
-
-        if (is_zero) then
-          val_is_zero = 1
-        else
-          val_is_zero = 0
-        endif
-        call launch_extract_hh_tau_c_kernel_real(bcast_buffer_dev, hh_tau_dev, nbw, n, val_is_zero)
-      end subroutine
-
-      ! -------------------------------------------
-      ! Fortran back-transformation support kernels
-      ! -------------------------------------------
-
-      ! Reset a reduction block
-      ! Limitation: the thread-block size must be a divider of the reduction block's size
-
-      ! Reset 2 reduction blocks without an explicit synchronization at the end
-      ! Limitation: the thread-block size must be a divider of the reduction block's size
-
-      ! Perform a reduction on an initialized, 128-element shared block
-
-      ! Compute the dot-product between 2 consecutive HH vectors
-      ! Limitation 1: the size of the thread block must be at most 128 and a power-of-2
-      ! Limitation 2: the size of the warp must be equal to 32
-
-      ! Extract "tau" from the HH matrix and replace it with 1.0 or 0.0 (depending on case)
-      ! Having "tau" as the first element in a HH reflector reduces space requirements, but causes undesired branching in the kernels
-
-      ! -------------------------------------------
-      ! Fortran back-transformation support kernels
-      ! -------------------------------------------
-
-      ! This is the simplest and slowest available backtransformation kernel
-
-      ! This is an improved version of the simple backtransformation kernel; here, we halve the number of iterations and apply
-      ! 2 Householder reflectors per iteration
-
-      ! ---------------------------------
-      ! Row packing and unpacking kernels
-      ! ---------------------------------
-
-      ! The row group packing kernel
-
-      ! Host wrapper for the Householder backtransformation step. Several kernels are available. Performance note:
-      ! - "compute_hh_trafo_c_kernel" is the C kernel for the backtransformation (this exhibits best performance)
-      ! - "compute_hh_trafo_kernel" is the Fortran equivalent of the C kernel
-      ! - "compute_hh_trafo_single_kernel" is the reference Fortran kernel
     end subroutine trans_ev_tridi_to_band_real
@@ -5187,21 +5036,21 @@ module ELPA2_compute
       ! Q = Q - V * T**T * V**T * Q
       if (l_rows > 0) then
         if (useGPU) then
           call cublas_zgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm_dev, max_local_rows, &
                             q_dev, ldq, CZERO, tmp_dev, n_cols)
           successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, n_cols * l_cols * size_of_complex_datatype, &
                                     cudaMemcpyDeviceToHost)
           if (.not.(successCUDA)) then
             print *, "trans_ev_band_to_full_complex: error in cudaMemcpy"
             stop
           endif
         else
           call zgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm, ubound(hvm, dim=1), &
                      q, ldq, CZERO, tmp1, n_cols)
         endif
       else ! l_rows > 0
         if (useGPU) then
           if (l_cols * n_cols .gt. (max_local_cols) * (nbw)) then
             print *, "trans_ev_band_to_full_complex: tmp_dev ", l_cols * n_cols, max_local_cols
@@ -5308,7 +5157,7 @@ module ELPA2_compute
       !if (istat .ne. 0) then
       !print *,"trans_ev_band_to_full_complex: error when deallocating tmat_temp "//errorMessage
       !endif
     endif ! use GPU
 #ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("trans_ev_band_to_full_complex")
 #endif
@@ -7775,7 +7624,12 @@ module ELPA2_compute
         enddo
       else
         do i = 1, min(na - num_blk * nblk, nblk)
-          call pack_row_complex_cpu(row, j * nblk + i + a_off)
+#ifdef WITH_OPENMP
+          call pack_row_complex_cpu_openmp(a, row, j * nblk + i + a_off, &
+                                           stripe_width, stripe_count, max_threads, thread_width, l_nev)
+#else
+          call pack_row_complex_cpu(a, row, j * nblk + i + a_off, stripe_width, last_stripe_width, stripe_count)
+#endif
           q((num_blk / np_rows) * nblk + i, 1:l_nev) = row(:)
         enddo
       endif
@@ -7784,7 +7638,13 @@ module ELPA2_compute
           call pack_row_group_complex_gpu(result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
         else
           do i = 1, nblk
-            call pack_row_complex_cpu(a, row, j * nblk + i + a_off, stripe_width, last_stripe_width, stripe_count)
+#ifdef WITH_OPENMP
+            call pack_row_complex_cpu_openmp(a, result_buffer(:, i, nbuf), j * nblk + i + a_off, &
+                                             stripe_width, stripe_count, max_threads, thread_width, l_nev)
+#else
+            call pack_row_complex_cpu(a, result_buffer(:, i, nbuf), j * nblk + i + a_off, stripe_width, &
+                                      last_stripe_width, stripe_count)
+#endif
           enddo
         endif
src/mod_pack_unpack_real_gpu.F90 (new file, 0 → 100644)

module pack_unpack_real_gpu
#include "config-f90.h"
  implicit none

  public pack_row_group_real_gpu, unpack_row_group_real_gpu, &
         unpack_and_prepare_row_group_real_gpu, compute_hh_dot_products_real_gpu, &
         extract_hh_tau_real_gpu

  contains

    ! Pack a filled row group (i.e. an array of consecutive rows)
    subroutine pack_row_group_real_gpu(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
                                       rows, n_offset, row_count)
      use cuda_c_kernel
      use cuda_functions
      use precision
      use iso_c_binding
      implicit none
      integer(kind=c_intptr_t)     :: row_group_dev, a_dev
      integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
      integer(kind=ik), intent(in) :: n_offset, row_count
      real(kind=rk)                :: rows(:,:)
      integer(kind=ik)             :: max_idx
      logical                      :: successCUDA

      ! Use many blocks for higher GPU occupancy
      max_idx = (stripe_count - 1) * stripe_width + last_stripe_width

      ! Use one kernel call to pack the entire row group
      ! call my_pack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, a_dev, row_group_dev)
      call launch_my_pack_c_kernel_real(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, &
                                        l_nev, a_dev, row_group_dev)

      ! Issue one single transfer call for all rows (device to host)
      ! rows(:, 1 : row_count) = row_group_dev(:, 1 : row_count)
      successCUDA = cuda_memcpy(loc(rows(:, 1:row_count)), row_group_dev, row_count * l_nev * size_of_real_datatype, &
                                cudaMemcpyDeviceToHost)
      if (.not.(successCUDA)) then
        print *, "pack_row_group_real_gpu: error in cudaMemcpy"
        stop
      endif
      !write(*,*) cudaGetErrorString(istat)
    end subroutine

    ! Unpack a filled row group (i.e. an array of consecutive rows)
    subroutine unpack_row_group_real_gpu(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
                                         a_dim2, l_nev, rows, n_offset, row_count)
      use cuda_c_kernel
      use precision
      use iso_c_binding
      use cuda_functions
      implicit none
      integer(kind=c_intptr_t)     :: row_group_dev, a_dev
      integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
      integer(kind=ik), intent(in) :: n_offset, row_count
      real(kind=rk), intent(in)    :: rows(:, :)
      integer(kind=ik)             :: max_idx
      integer(kind=ik)             :: i
      logical                      :: successCUDA

      ! Use many blocks for higher GPU occupancy
      max_idx = (stripe_count - 1) * stripe_width + last_stripe_width

      ! Issue one single transfer call for all rows (host to device)
      ! row_group_dev(:, 1 : row_count) = rows(:, 1 : row_count)
      !istat = cuda_memcpy( row_group_dev , loc(rows(:, 1: row_count)),row_count * l_nev * size_of_real_datatype , &
      !        cudaMemcpyHostToDevice)
      successCUDA = cuda_memcpy(row_group_dev, loc(rows(1, 1)), row_count * l_nev * &
                                size_of_real_datatype, cudaMemcpyHostToDevice)
      if (.not.(successCUDA)) then
        print *, "unpack_row_group_real_gpu: error in cudaMemcpy"
        stop
      endif
      !write(*,*) cudaGetErrorString(istat)

      ! Use one kernel call to unpack the entire row group
      ! call my_unpack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, row_group_dev, a_dev)
      call launch_my_unpack_c_kernel_real(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, &
                                          row_group_dev, a_dev)
    end subroutine

    ! This subroutine must be called before queuing the next row for unpacking; it ensures that an unpacking of the current
    ! row group occurs when the queue is full or when the next row belongs to another group
    subroutine unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
                                                     last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
                                                     unpack_idx, next_unpack_idx, force)
      use iso_c_binding
      use precision
      implicit none
      real(kind=rk)                   :: row_group(:,:)
      integer(kind=c_intptr_t)        :: row_group_dev, a_dev
      integer(kind=ik), intent(in)    :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
      integer(kind=ik), intent(inout) :: row_group_size
      integer(kind=ik), intent(in)    :: nblk
      integer(kind=ik), intent(inout) :: unpack_idx
      integer(kind=ik), intent(in)    :: next_unpack_idx
      logical, intent(in)             :: force

      if (row_group_size == 0) then
        ! Nothing to flush, just prepare for the upcoming row
        row_group_size = 1
      else
        if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
          ! A flush and a reset must be performed
          call unpack_row_group_real_gpu(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
                                         a_dim2, l_nev, row_group(:, :), unpack_idx - row_group_size, row_group_size)
          row_group_size = 1
        else
          ! Just prepare for the upcoming row
          row_group_size = row_group_size + 1
        endif
      endif
      ! Always update the index for the upcoming row
      unpack_idx = next_unpack_idx
    end subroutine

    ! The host wrapper for computing the dot products between consecutive HH reflectors (see the kernel below)
    subroutine compute_hh_dot_products_real_gpu(bcast_buffer_dev, hh_dot_dev, nbw, n)
      use cuda_c_kernel
      use precision
      use iso_c_binding
      implicit none
      integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_dot_dev
      integer(kind=ik), value  :: nbw, n

      if (n .le. 1) return
      call launch_compute_hh_dotp_c_kernel_real(bcast_buffer_dev, hh_dot_dev, nbw, n)
    end subroutine

    ! The host wrapper for extracting "tau" from the HH reflectors (see the kernel below)
    subroutine extract_hh_tau_real_gpu(bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero)
      use cuda_c_kernel
      use precision
      use iso_c_binding
      implicit none
      integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_tau_dev
      integer(kind=ik), value  :: nbw, n
      logical, value           :: is_zero
      integer(kind=ik)         :: val_is_zero

      if (is_zero) then
        val_is_zero = 1
      else
        val_is_zero = 0
      endif
      call launch_extract_hh_tau_c_kernel_real(bcast_buffer_dev, hh_tau_dev, nbw, n, val_is_zero)
    end subroutine

    ! -------------------------------------------
    ! Fortran back-transformation support kernels
    ! -------------------------------------------

    ! Reset a reduction block
    ! Limitation: the thread-block size must be a divider of the reduction block's size

    ! Reset 2 reduction blocks without an explicit synchronization at the end
    ! Limitation: the thread-block size must be a divider of the reduction block's size

    ! Perform a reduction on an initialized, 128-element shared block

    ! Compute the dot-product between 2 consecutive HH vectors
    ! Limitation 1: the size of the thread block must be at most 128 and a power-of-2
    ! Limitation 2: the size of the warp must be equal to 32

    ! Extract "tau" from the HH matrix and replace it with 1.0 or 0.0 (depending on case)
    ! Having "tau" as the first element in a HH reflector reduces space requirements, but causes undesired branching in the kernels

    ! -------------------------------------------
    ! Fortran back-transformation support kernels
    ! -------------------------------------------

    ! This is the simplest and slowest available backtransformation kernel

    ! This is an improved version of the simple backtransformation kernel; here, we halve the number of iterations and apply
    ! 2 Householder reflectors per iteration

    ! ---------------------------------
    ! Row packing and unpacking kernels
    ! ---------------------------------

    ! The row group packing kernel

    ! Host wrapper for the Householder backtransformation step. Several kernels are available. Performance note:
    ! - "compute_hh_trafo_c_kernel" is the C kernel for the backtransformation (this exhibits best performance)
    ! - "compute_hh_trafo_kernel" is the Fortran equivalent of the C kernel
    ! - "compute_hh_trafo_single_kernel" is the reference Fortran kernel

end module
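The comment block at the end of the new module documents GPU kernels that live in the CUDA/C sources, so only their host wrappers appear above. As a rough illustration of the tau-extraction step those comments describe, here is a host-side Fortran sketch; the subroutine name, argument kinds and the exact replacement values are assumptions made for illustration, not the actual CUDA kernel:

    ! Hypothetical host-side sketch of the behaviour described in the comments:
    ! tau is stored as the first element of each Householder reflector; it is
    ! copied out and that element is overwritten with 1.0 (presumably with 0.0
    ! when is_zero is requested, as the val_is_zero flag in the wrapper suggests).
    subroutine extract_hh_tau_sketch(hh, tau, nbw, n, is_zero)
      implicit none
      integer, intent(in)    :: nbw, n
      real,    intent(inout) :: hh(nbw, n)   ! one reflector per column, tau in hh(1, j)
      real,    intent(out)   :: tau(n)
      logical, intent(in)    :: is_zero
      integer :: j

      do j = 1, n
        tau(j) = hh(1, j)
        if (is_zero) then
          hh(1, j) = 0.0
        else
          hh(1, j) = 1.0
        end if
      end do
    end subroutine extract_hh_tau_sketch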