Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
f515b7b2
Commit
f515b7b2
authored
Mar 01, 2021
by
Andreas Marek
Browse files
Rename successCUDA -> successGPU
parent
f7a2f3d2
Changes
11
Hide whitespace changes
Inline
Side-by-side
src/elpa1/elpa1_trans_ev_template.F90
View file @
f515b7b2
...
...
@@ -139,7 +139,7 @@ subroutine trans_ev_&
integer
(
kind
=
c_intptr_t
)
::
num
integer
(
kind
=
C_intptr_T
)
::
q_dev
,
tmp_dev
,
hvm_dev
,
tmat_dev
integer
(
kind
=
ik
)
::
blockStep
logical
::
success
CUDA
logical
::
success
GPU
integer
(
kind
=
c_intptr_t
),
parameter
::
size_of_datatype
=
size_of_
&
&
PRECISION
&
&
_
&
...
...
@@ -242,45 +242,45 @@ subroutine trans_ev_&
!&MATH_DATATYPE&
!&", "hvm1", istat, errorMessage)
num
=
(
max_local_rows
*
max_stored_rows
)
*
size_of_datatype
success
CUDA
=
gpu_malloc_host
(
hvm1_host
,
num
)
check_alloc_cuda
(
"trans_ev: hvm1_host"
,
success
CUDA
)
success
GPU
=
gpu_malloc_host
(
hvm1_host
,
num
)
check_alloc_cuda
(
"trans_ev: hvm1_host"
,
success
GPU
)
call
c_f_pointer
(
hvm1_host
,
hvm1
,(/(
max_local_rows
*
max_stored_rows
)/))
num
=
(
max_stored_rows
*
max_stored_rows
)
*
size_of_datatype
success
CUDA
=
gpu_malloc_host
(
tmat_host
,
num
)
check_alloc_cuda
(
"trans_ev: tmat_host"
,
success
CUDA
)
success
GPU
=
gpu_malloc_host
(
tmat_host
,
num
)
check_alloc_cuda
(
"trans_ev: tmat_host"
,
success
GPU
)
call
c_f_pointer
(
tmat_host
,
tmat
,(/
max_stored_rows
,
max_stored_rows
/))
num
=
(
max_local_cols
*
max_stored_rows
)
*
size_of_datatype
success
CUDA
=
gpu_malloc_host
(
tmp1_host
,
num
)
check_alloc_cuda
(
"trans_ev: tmp1_host"
,
success
CUDA
)
success
GPU
=
gpu_malloc_host
(
tmp1_host
,
num
)
check_alloc_cuda
(
"trans_ev: tmp1_host"
,
success
GPU
)
call
c_f_pointer
(
tmp1_host
,
tmp1
,(/(
max_local_cols
*
max_stored_rows
)/))
num
=
(
max_local_cols
*
max_stored_rows
)
*
size_of_datatype
success
CUDA
=
gpu_malloc_host
(
tmp2_host
,
num
)
check_alloc_cuda
(
"trans_ev: tmp2_host"
,
success
CUDA
)
success
GPU
=
gpu_malloc_host
(
tmp2_host
,
num
)
check_alloc_cuda
(
"trans_ev: tmp2_host"
,
success
GPU
)
call
c_f_pointer
(
tmp2_host
,
tmp2
,(/(
max_local_cols
*
max_stored_rows
)/))
success
CUDA
=
gpu_malloc
(
tmat_dev
,
max_stored_rows
*
max_stored_rows
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev"
,
success
CUDA
)
success
GPU
=
gpu_malloc
(
tmat_dev
,
max_stored_rows
*
max_stored_rows
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev"
,
success
GPU
)
success
CUDA
=
gpu_malloc
(
hvm_dev
,
max_local_rows
*
max_stored_rows
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev"
,
success
CUDA
)
success
GPU
=
gpu_malloc
(
hvm_dev
,
max_local_rows
*
max_stored_rows
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev"
,
success
GPU
)
success
CUDA
=
gpu_malloc
(
tmp_dev
,
max_local_cols
*
max_stored_rows
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev"
,
success
CUDA
)
success
GPU
=
gpu_malloc
(
tmp_dev
,
max_local_cols
*
max_stored_rows
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev"
,
success
GPU
)
num
=
ldq
*
matrixCols
*
size_of_datatype
success
CUDA
=
gpu_malloc
(
q_dev
,
num
)
check_alloc_cuda
(
"trans_ev"
,
success
CUDA
)
success
GPU
=
gpu_malloc
(
q_dev
,
num
)
check_alloc_cuda
(
"trans_ev"
,
success
GPU
)
success
CUDA
=
gpu_host_register
(
int
(
loc
(
q_mat
),
kind
=
c_intptr_t
),
num
,&
success
GPU
=
gpu_host_register
(
int
(
loc
(
q_mat
),
kind
=
c_intptr_t
),
num
,&
gpuHostRegisterDefault
)
check_host_register_cuda
(
"trans_ev: q_mat"
,
success
CUDA
)
check_host_register_cuda
(
"trans_ev: q_mat"
,
success
GPU
)
success
CUDA
=
gpu_memcpy
(
q_dev
,
int
(
loc
(
q_mat
(
1
,
1
)),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
q_dev
,
int
(
loc
(
q_mat
(
1
,
1
)),
kind
=
c_intptr_t
),
&
num
,
gpuMemcpyHostToDevice
)
check_memcpy_cuda
(
"trans_ev"
,
success
CUDA
)
check_memcpy_cuda
(
"trans_ev"
,
success
GPU
)
endif
! useGPU
do
istep
=
1
,
na
,
blockStep
...
...
@@ -387,15 +387,15 @@ subroutine trans_ev_&
hvm1
(
1
:
hvm_ubnd
*
nstor
)
=
reshape
(
hvm
(
1
:
hvm_ubnd
,
1
:
nstor
),
(/
hvm_ubnd
*
nstor
/))
!hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
success
CUDA
=
gpu_memcpy
(
hvm_dev
,
int
(
loc
(
hvm1
(
1
)),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
hvm_dev
,
int
(
loc
(
hvm1
(
1
)),
kind
=
c_intptr_t
),
&
hvm_ubnd
*
nstor
*
size_of_datatype
,
gpuMemcpyHostToDevice
)
check_memcpy_cuda
(
"trans_ev"
,
success
CUDA
)
check_memcpy_cuda
(
"trans_ev"
,
success
GPU
)
!tmat_dev = tmat
success
CUDA
=
gpu_memcpy
(
tmat_dev
,
int
(
loc
(
tmat
(
1
,
1
)),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
tmat_dev
,
int
(
loc
(
tmat
(
1
,
1
)),
kind
=
c_intptr_t
),
&
max_stored_rows
*
max_stored_rows
*
size_of_datatype
,
gpuMemcpyHostToDevice
)
check_memcpy_cuda
(
"trans_ev"
,
success
CUDA
)
check_memcpy_cuda
(
"trans_ev"
,
success
GPU
)
endif
! Q = Q - V * T * V**T * Q
...
...
@@ -421,8 +421,8 @@ subroutine trans_ev_&
else
!l_rows>0
if
(
useGPU
)
then
success
CUDA
=
gpu_memset
(
tmp_dev
,
0
,
l_cols
*
nstor
*
size_of_datatype
)
check_memcpy_cuda
(
"trans_ev"
,
success
CUDA
)
success
GPU
=
gpu_memset
(
tmp_dev
,
0
,
l_cols
*
nstor
*
size_of_datatype
)
check_memcpy_cuda
(
"trans_ev"
,
success
GPU
)
else
tmp1
(
1
:
l_cols
*
nstor
)
=
0
endif
...
...
@@ -432,9 +432,9 @@ subroutine trans_ev_&
! In the legacy GPU version, this allreduce was omitted. But probably it has to be done for GPU + MPI
! todo: does it need to be copied whole? Wouldn't a part be sufficient?
if
(
useGPU
)
then
success
CUDA
=
gpu_memcpy
(
int
(
loc
(
tmp1
(
1
)),
kind
=
c_intptr_t
),
tmp_dev
,
&
success
GPU
=
gpu_memcpy
(
int
(
loc
(
tmp1
(
1
)),
kind
=
c_intptr_t
),
tmp_dev
,
&
max_local_cols
*
max_stored_rows
*
size_of_datatype
,
gpuMemcpyDeviceToHost
)
check_memcpy_cuda
(
"trans_ev"
,
success
CUDA
)
check_memcpy_cuda
(
"trans_ev"
,
success
GPU
)
endif
call
obj
%
timer
%
start
(
"mpi_communication"
)
call
mpi_allreduce
(
tmp1
,
tmp2
,
int
(
nstor
*
l_cols
,
kind
=
MPI_KIND
),
MPI_MATH_DATATYPE_PRECISION
,
MPI_SUM
,
&
...
...
@@ -442,9 +442,9 @@ subroutine trans_ev_&
call
obj
%
timer
%
stop
(
"mpi_communication"
)
! copy back tmp2 - after reduction...
if
(
useGPU
)
then
success
CUDA
=
gpu_memcpy
(
tmp_dev
,
int
(
loc
(
tmp2
(
1
)),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
tmp_dev
,
int
(
loc
(
tmp2
(
1
)),
kind
=
c_intptr_t
),
&
max_local_cols
*
max_stored_rows
*
size_of_datatype
,
gpuMemcpyHostToDevice
)
check_memcpy_cuda
(
"trans_ev"
,
success
CUDA
)
check_memcpy_cuda
(
"trans_ev"
,
success
GPU
)
endif
! useGPU
...
...
@@ -498,27 +498,27 @@ subroutine trans_ev_&
if
(
useGPU
)
then
!q_mat = q_dev
success
CUDA
=
gpu_memcpy
(
int
(
loc
(
q_mat
(
1
,
1
)),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
int
(
loc
(
q_mat
(
1
,
1
)),
kind
=
c_intptr_t
),
&
q_dev
,
ldq
*
matrixCols
*
size_of_datatype
,
gpuMemcpyDeviceToHost
)
check_memcpy_cuda
(
"trans_ev"
,
success
CUDA
)
check_memcpy_cuda
(
"trans_ev"
,
success
GPU
)
success
CUDA
=
gpu_host_unregister
(
int
(
loc
(
q_mat
),
kind
=
c_intptr_t
))
check_host_unregister_cuda
(
"trans_ev: q_mat"
,
success
CUDA
)
success
GPU
=
gpu_host_unregister
(
int
(
loc
(
q_mat
),
kind
=
c_intptr_t
))
check_host_unregister_cuda
(
"trans_ev: q_mat"
,
success
GPU
)
success
CUDA
=
gpu_free_host
(
hvm1_host
)
check_host_dealloc_cuda
(
"trans_ev: hvm1_host"
,
success
CUDA
)
success
GPU
=
gpu_free_host
(
hvm1_host
)
check_host_dealloc_cuda
(
"trans_ev: hvm1_host"
,
success
GPU
)
nullify
(
hvm1
)
success
CUDA
=
gpu_free_host
(
tmat_host
)
check_host_dealloc_cuda
(
"trans_ev: tmat_host"
,
success
CUDA
)
success
GPU
=
gpu_free_host
(
tmat_host
)
check_host_dealloc_cuda
(
"trans_ev: tmat_host"
,
success
GPU
)
nullify
(
tmat
)
success
CUDA
=
gpu_free_host
(
tmp1_host
)
check_host_dealloc_cuda
(
"trans_ev: tmp1_host"
,
success
CUDA
)
success
GPU
=
gpu_free_host
(
tmp1_host
)
check_host_dealloc_cuda
(
"trans_ev: tmp1_host"
,
success
GPU
)
nullify
(
tmp1
)
success
CUDA
=
gpu_free_host
(
tmp2_host
)
check_host_dealloc_cuda
(
"trans_ev: tmp2_host"
,
success
CUDA
)
success
GPU
=
gpu_free_host
(
tmp2_host
)
check_host_dealloc_cuda
(
"trans_ev: tmp2_host"
,
success
GPU
)
nullify
(
tmp2
)
!deallocate(hvm1, stat=istat, errmsg=errorMessage)
...
...
@@ -530,17 +530,17 @@ subroutine trans_ev_&
!endif
!deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
success
CUDA
=
gpu_free
(
q_dev
)
check_dealloc_cuda
(
"trans_ev"
,
success
CUDA
)
success
GPU
=
gpu_free
(
q_dev
)
check_dealloc_cuda
(
"trans_ev"
,
success
GPU
)
success
CUDA
=
gpu_free
(
tmp_dev
)
check_dealloc_cuda
(
"trans_ev"
,
success
CUDA
)
success
GPU
=
gpu_free
(
tmp_dev
)
check_dealloc_cuda
(
"trans_ev"
,
success
GPU
)
success
CUDA
=
gpu_free
(
hvm_dev
)
check_dealloc_cuda
(
"trans_ev"
,
success
CUDA
)
success
GPU
=
gpu_free
(
hvm_dev
)
check_dealloc_cuda
(
"trans_ev"
,
success
GPU
)
success
CUDA
=
gpu_free
(
tmat_dev
)
check_dealloc_cuda
(
"trans_ev"
,
success
CUDA
)
success
GPU
=
gpu_free
(
tmat_dev
)
check_dealloc_cuda
(
"trans_ev"
,
success
GPU
)
else
deallocate
(
tmat
,
tmp1
,
tmp2
,
stat
=
istat
,
errmsg
=
errorMessage
)
check_deallocate
(
"trans_ev_&
...
...
src/elpa1/elpa1_tridiag_template.F90
View file @
f515b7b2
...
...
@@ -139,7 +139,7 @@ subroutine tridiag_&
integer
(
kind
=
C_intptr_T
)
::
a_dev
,
v_row_dev
,
v_col_dev
,
u_row_dev
,
u_col_dev
,
vu_stored_rows_dev
,
&
uv_stored_cols_dev
logical
::
success
CUDA
logical
::
success
GPU
integer
(
kind
=
ik
)
::
istep
,
i
,
j
,
l_col_beg
,
l_col_end
,
l_row_beg
,
l_row_end
integer
(
kind
=
ik
)
::
tile_size
,
l_rows_per_tile
,
l_cols_per_tile
...
...
@@ -290,52 +290,52 @@ subroutine tridiag_&
if
(
useGPU
)
then
num
=
(
max_local_rows
+1
)
*
size_of_datatype
success
CUDA
=
gpu_malloc_host
(
v_row_host
,
num
)
check_host_alloc_cuda
(
"tridiag: v_row_host"
,
success
CUDA
)
success
GPU
=
gpu_malloc_host
(
v_row_host
,
num
)
check_host_alloc_cuda
(
"tridiag: v_row_host"
,
success
GPU
)
call
c_f_pointer
(
v_row_host
,
v_row
,(/(
max_local_rows
+1
)/))
num
=
(
max_local_cols
)
*
size_of_datatype
success
CUDA
=
gpu_malloc_host
(
v_col_host
,
num
)
check_host_alloc_cuda
(
"tridiag: v_col_host"
,
success
CUDA
)
success
GPU
=
gpu_malloc_host
(
v_col_host
,
num
)
check_host_alloc_cuda
(
"tridiag: v_col_host"
,
success
GPU
)
call
c_f_pointer
(
v_col_host
,
v_col
,(/(
max_local_cols
)/))
num
=
(
max_local_cols
)
*
size_of_datatype
success
CUDA
=
gpu_malloc_host
(
u_col_host
,
num
)
check_host_alloc_cuda
(
"tridiag: u_col_host"
,
success
CUDA
)
success
GPU
=
gpu_malloc_host
(
u_col_host
,
num
)
check_host_alloc_cuda
(
"tridiag: u_col_host"
,
success
GPU
)
call
c_f_pointer
(
u_col_host
,
u_col
,(/(
max_local_cols
)/))
num
=
(
max_local_rows
)
*
size_of_datatype
success
CUDA
=
gpu_malloc_host
(
u_row_host
,
num
)
check_host_alloc_cuda
(
"tridiag: u_row_host"
,
success
CUDA
)
success
GPU
=
gpu_malloc_host
(
u_row_host
,
num
)
check_host_alloc_cuda
(
"tridiag: u_row_host"
,
success
GPU
)
call
c_f_pointer
(
u_row_host
,
u_row
,(/(
max_local_rows
)/))
num
=
(
max_local_rows
*
2
*
max_stored_uv
)
*
size_of_datatype
success
CUDA
=
gpu_host_register
(
int
(
loc
(
vu_stored_rows
),
kind
=
c_intptr_t
),
num
,&
success
GPU
=
gpu_host_register
(
int
(
loc
(
vu_stored_rows
),
kind
=
c_intptr_t
),
num
,&
gpuHostRegisterDefault
)
check_host_register_cuda
(
"tridiag: vu_stored_roes"
,
success
CUDA
)
check_host_register_cuda
(
"tridiag: vu_stored_roes"
,
success
GPU
)
num
=
(
max_local_cols
*
2
*
max_stored_uv
)
*
size_of_datatype
success
CUDA
=
gpu_host_register
(
int
(
loc
(
uv_stored_cols
),
kind
=
c_intptr_t
),
num
,&
success
GPU
=
gpu_host_register
(
int
(
loc
(
uv_stored_cols
),
kind
=
c_intptr_t
),
num
,&
gpuHostRegisterDefault
)
check_host_register_cuda
(
"tridiag: uv_stored_cols"
,
success
CUDA
)
check_host_register_cuda
(
"tridiag: uv_stored_cols"
,
success
GPU
)
#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
num
=
na
*
8
#else
num
=
na
*
4
#endif
success
CUDA
=
gpu_host_register
(
int
(
loc
(
e_vec
),
kind
=
c_intptr_t
),
num
,&
success
GPU
=
gpu_host_register
(
int
(
loc
(
e_vec
),
kind
=
c_intptr_t
),
num
,&
gpuHostRegisterDefault
)
check_host_register_cuda
(
"tridiag: e_vec"
,
success
CUDA
)
check_host_register_cuda
(
"tridiag: e_vec"
,
success
GPU
)
#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
num
=
na
*
8
#else
num
=
na
*
4
#endif
success
CUDA
=
gpu_host_register
(
int
(
loc
(
d_vec
),
kind
=
c_intptr_t
),
num
,&
success
GPU
=
gpu_host_register
(
int
(
loc
(
d_vec
),
kind
=
c_intptr_t
),
num
,&
gpuHostRegisterDefault
)
check_host_register_cuda
(
"tridiag: d_vec"
,
success
CUDA
)
check_host_register_cuda
(
"tridiag: d_vec"
,
success
GPU
)
else
allocate
(
v_row
(
max_local_rows
+1
),
stat
=
istat
,
errmsg
=
errorMessage
)
...
...
@@ -373,24 +373,24 @@ subroutine tridiag_&
u_col
=
0
if
(
useGPU
)
then
success
CUDA
=
gpu_malloc
(
v_row_dev
,
max_local_rows
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: v_row_dev"
,
success
CUDA
)
success
GPU
=
gpu_malloc
(
v_row_dev
,
max_local_rows
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: v_row_dev"
,
success
GPU
)
success
CUDA
=
gpu_malloc
(
u_row_dev
,
max_local_rows
*
size_of_datatype
)
success
GPU
=
gpu_malloc
(
u_row_dev
,
max_local_rows
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: u_row_dev"
,
success
CUDA
)
check_alloc_cuda
(
"tridiag: u_row_dev"
,
success
GPU
)
success
CUDA
=
gpu_malloc
(
v_col_dev
,
max_local_cols
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: v_col_dev"
,
success
CUDA
)
success
GPU
=
gpu_malloc
(
v_col_dev
,
max_local_cols
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: v_col_dev"
,
success
GPU
)
success
CUDA
=
gpu_malloc
(
u_col_dev
,
max_local_cols
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: u_col_dev"
,
success
CUDA
)
success
GPU
=
gpu_malloc
(
u_col_dev
,
max_local_cols
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: u_col_dev"
,
success
GPU
)
success
CUDA
=
gpu_malloc
(
vu_stored_rows_dev
,
max_local_rows
*
2
*
max_stored_uv
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: vu_stored_rows_dev"
,
success
CUDA
)
success
GPU
=
gpu_malloc
(
vu_stored_rows_dev
,
max_local_rows
*
2
*
max_stored_uv
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: vu_stored_rows_dev"
,
success
GPU
)
success
CUDA
=
gpu_malloc
(
uv_stored_cols_dev
,
max_local_cols
*
2
*
max_stored_uv
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: vu_stored_rows_dev"
,
success
CUDA
)
success
GPU
=
gpu_malloc
(
uv_stored_cols_dev
,
max_local_cols
*
2
*
max_stored_uv
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: vu_stored_rows_dev"
,
success
GPU
)
endif
!useGPU
...
...
@@ -416,16 +416,16 @@ subroutine tridiag_&
num
=
matrixRows
*
matrixCols
*
size_of_datatype
success
CUDA
=
gpu_malloc
(
a_dev
,
num
)
check_alloc_cuda
(
"tridiag: a_dev"
,
success
CUDA
)
success
GPU
=
gpu_malloc
(
a_dev
,
num
)
check_alloc_cuda
(
"tridiag: a_dev"
,
success
GPU
)
success
CUDA
=
gpu_host_register
(
int
(
loc
(
a_mat
),
kind
=
c_intptr_t
),
num
,&
success
GPU
=
gpu_host_register
(
int
(
loc
(
a_mat
),
kind
=
c_intptr_t
),
num
,&
gpuHostRegisterDefault
)
check_host_register_cuda
(
"tridiag: a_mat"
,
success
CUDA
)
check_host_register_cuda
(
"tridiag: a_mat"
,
success
GPU
)
success
CUDA
=
gpu_memcpy
(
a_dev
,
int
(
loc
(
a_mat
(
1
,
1
)),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
a_dev
,
int
(
loc
(
a_mat
(
1
,
1
)),
kind
=
c_intptr_t
),
&
num
,
gpuMemcpyHostToDevice
)
check_memcpy_cuda
(
"tridiag: a_dev"
,
success
CUDA
)
check_memcpy_cuda
(
"tridiag: a_dev"
,
success
GPU
)
endif
! main cycle of tridiagonalization
...
...
@@ -448,12 +448,12 @@ subroutine tridiag_&
! copy l_cols + 1 column of A to v_row
if
(
useGPU
)
then
a_offset
=
l_cols
*
matrixRows
*
size_of_datatype
! we use v_row on the host at the moment! success
CUDA
= cuda_memcpy(v_row_dev, a_dev + a_offset,
! we use v_row on the host at the moment! success
GPU
= cuda_memcpy(v_row_dev, a_dev + a_offset,
! (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
success
CUDA
=
gpu_memcpy
(
int
(
loc
(
v_row
),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
int
(
loc
(
v_row
),
kind
=
c_intptr_t
),
&
a_dev
+
a_offset
,
(
l_rows
)
*
size_of_datatype
,
gpuMemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag a_dev 1"
,
success
CUDA
)
check_memcpy_cuda
(
"tridiag a_dev 1"
,
success
GPU
)
else
v_row
(
1
:
l_rows
)
=
a_mat
(
1
:
l_rows
,
l_cols
+1
)
endif
...
...
@@ -564,20 +564,20 @@ subroutine tridiag_&
u_row
(
1
:
l_rows
)
=
0
if
(
l_rows
>
0
.and.
l_cols
>
0
)
then
if
(
useGPU
)
then
success
CUDA
=
gpu_memset
(
u_col_dev
,
0
,
l_cols
*
size_of_datatype
)
check_memcpy_cuda
(
"tridiag: u_col_dev"
,
success
CUDA
)
success
GPU
=
gpu_memset
(
u_col_dev
,
0
,
l_cols
*
size_of_datatype
)
check_memcpy_cuda
(
"tridiag: u_col_dev"
,
success
GPU
)
success
CUDA
=
gpu_memset
(
u_row_dev
,
0
,
l_rows
*
size_of_datatype
)
check_memcpy_cuda
(
"tridiag: u_row_dev"
,
success
CUDA
)
success
GPU
=
gpu_memset
(
u_row_dev
,
0
,
l_rows
*
size_of_datatype
)
check_memcpy_cuda
(
"tridiag: u_row_dev"
,
success
GPU
)
success
CUDA
=
gpu_memcpy
(
v_col_dev
,
int
(
loc
(
v_col
(
1
)),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
v_col_dev
,
int
(
loc
(
v_col
(
1
)),
kind
=
c_intptr_t
),
&
l_cols
*
size_of_datatype
,
gpuMemcpyHostToDevice
)
check_memcpy_cuda
(
"tridiag: v_col_dev"
,
success
CUDA
)
check_memcpy_cuda
(
"tridiag: v_col_dev"
,
success
GPU
)
success
CUDA
=
gpu_memcpy
(
v_row_dev
,
int
(
loc
(
v_row
(
1
)),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
v_row_dev
,
int
(
loc
(
v_row
(
1
)),
kind
=
c_intptr_t
),
&
l_rows
*
size_of_datatype
,
gpuMemcpyHostToDevice
)
check_memcpy_cuda
(
"tridiag: v_row_dev"
,
success
CUDA
)
check_memcpy_cuda
(
"tridiag: v_row_dev"
,
success
GPU
)
endif
! useGPU
#ifdef WITH_OPENMP_TRADITIONAL
...
...
@@ -589,7 +589,7 @@ subroutine tridiag_&
!$omp num_threads(max_threads) &
!$omp default(none) &
!$omp private(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end) &
!$omp shared(useGPU, isSkewsymmetric, gpuMemcpyDeviceToHost, success
Cuda
, u_row, u_row_dev, &
!$omp shared(useGPU, isSkewsymmetric, gpuMemcpyDeviceToHost, success
GPU
, u_row, u_row_dev, &
!$omp & v_row, v_row_dev, v_col, v_col_dev, u_col, u_col_dev, a_dev, a_offset, &
!$omp& max_local_cols, max_local_rows, obj, wantDebug, l_rows_per_tile, l_cols_per_tile, &
!$omp& matrixRows, istep, tile_size, l_rows, l_cols, ur_p, uc_p, a_mat)
...
...
@@ -741,13 +741,13 @@ subroutine tridiag_&
enddo
end
if
!multiplication as one block / per stripes
success
CUDA
=
gpu_memcpy
(
int
(
loc
(
u_col
(
1
)),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
int
(
loc
(
u_col
(
1
)),
kind
=
c_intptr_t
),
&
u_col_dev
,
l_cols
*
size_of_datatype
,
gpuMemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag: u_col_dev 1"
,
success
CUDA
)
check_memcpy_cuda
(
"tridiag: u_col_dev 1"
,
success
GPU
)
success
CUDA
=
gpu_memcpy
(
int
(
loc
(
u_row
(
1
)),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
int
(
loc
(
u_row
(
1
)),
kind
=
c_intptr_t
),
&
u_row_dev
,
l_rows
*
size_of_datatype
,
gpuMemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag: u_row_dev 1"
,
success
CUDA
)
check_memcpy_cuda
(
"tridiag: u_row_dev 1"
,
success
GPU
)
endif
! useGPU
#ifdef WITH_OPENMP_TRADITIONAL
...
...
@@ -873,15 +873,15 @@ subroutine tridiag_&
if
(
n_stored_vecs
==
max_stored_uv
.or.
istep
==
3
)
then
if
(
useGPU
)
then
success
CUDA
=
gpu_memcpy
(
vu_stored_rows_dev
,
int
(
loc
(
vu_stored_rows
(
1
,
1
)),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
vu_stored_rows_dev
,
int
(
loc
(
vu_stored_rows
(
1
,
1
)),
kind
=
c_intptr_t
),
&
max_local_rows
*
2
*
max_stored_uv
*
&
size_of_datatype
,
gpuMemcpyHostToDevice
)
check_memcpy_cuda
(
"tridiag: uv_stored_rows_dev"
,
success
CUDA
)
check_memcpy_cuda
(
"tridiag: uv_stored_rows_dev"
,
success
GPU
)
success
CUDA
=
gpu_memcpy
(
uv_stored_cols_dev
,
int
(
loc
(
uv_stored_cols
(
1
,
1
)),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
uv_stored_cols_dev
,
int
(
loc
(
uv_stored_cols
(
1
,
1
)),
kind
=
c_intptr_t
),
&
max_local_cols
*
2
*
max_stored_uv
*
&
size_of_datatype
,
gpuMemcpyHostToDevice
)
check_memcpy_cuda
(
"tridiag: uv_stored_cols_dev"
,
success
CUDA
)
check_memcpy_cuda
(
"tridiag: uv_stored_cols_dev"
,
success
GPU
)
endif
do
i
=
0
,
(
istep
-2
)/
tile_size
...
...
@@ -944,9 +944,9 @@ subroutine tridiag_&
!a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
a_offset
=
((
l_rows
-
1
)
+
matrixRows
*
(
l_cols
-
1
))
*
size_of_datatype
success
CUDA
=
gpu_memcpy
(
int
(
loc
(
a_mat
(
l_rows
,
l_cols
)),
kind
=
c_intptr_t
),
a_dev
+
a_offset
,
&
success
GPU
=
gpu_memcpy
(
int
(
loc
(
a_mat
(
l_rows
,
l_cols
)),
kind
=
c_intptr_t
),
a_dev
+
a_offset
,
&
1
*
size_of_datatype
,
gpuMemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag: a_dev 3"
,
success
CUDA
)
check_memcpy_cuda
(
"tridiag: a_dev 3"
,
success
GPU
)
endif
if
(
n_stored_vecs
>
0
)
then
...
...
@@ -966,12 +966,12 @@ subroutine tridiag_&
if
(
useGPU
)
then
!a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols)
!success
CUDA
= cuda_threadsynchronize()
!check_memcpy_cuda("tridiag: a_dev 4a5a", success
CUDA
)
!success
GPU
= cuda_threadsynchronize()
!check_memcpy_cuda("tridiag: a_dev 4a5a", success
GPU
)
success
CUDA
=
gpu_memcpy
(
a_dev
+
a_offset
,
int
(
loc
(
a_mat
(
l_rows
,
l_cols
)),
kind
=
c_intptr_t
),
&
success
GPU
=
gpu_memcpy
(
a_dev
+
a_offset
,
int
(
loc
(
a_mat
(
l_rows
,
l_cols
)),
kind
=
c_intptr_t
),
&
int
(
1
*
size_of_datatype
,
kind
=
c_intptr_t
),
gpuMemcpyHostToDevice
)
check_memcpy_cuda
(
"tridiag: a_dev 4"
,
success
CUDA
)
check_memcpy_cuda
(
"tridiag: a_dev 4"
,
success
GPU
)
endif
endif
...
...
@@ -984,9 +984,9 @@ subroutine tridiag_&
if
(
my_prow
==
prow
(
1
,
nblk
,
np_rows
))
then
! We use last l_cols value of loop above
if
(
useGPU
)
then
success
CUDA
=
gpu_memcpy
(
int
(
loc
(
aux3
(
1
)),
kind
=
c_intptr_t
),
a_dev
+
(
matrixRows
*
(
l_cols
-
1
))
*
size_of_datatype
,
&
success
GPU
=
gpu_memcpy
(
int
(
loc
(
aux3
(
1
)),
kind
=
c_intptr_t
),
a_dev
+
(
matrixRows
*
(
l_cols
-
1
))
*
size_of_datatype
,
&
1
*
size_of_datatype
,
gpuMemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag: a_dev 5"
,
success
CUDA
)
check_memcpy_cuda
(
"tridiag: a_dev 5"
,
success
GPU
)
vrl
=
aux3
(
1
)
else
!useGPU
vrl
=
a_mat
(
1
,
l_cols
)
...
...
@@ -1020,9 +1020,9 @@ subroutine tridiag_&
#endif /* WITH_MPI */
if
(
my_prow
==
prow
(
1
,
nblk
,
np_rows
)
.and.
my_pcol
==
pcol
(
1
,
nblk
,
np_cols
))
then
if
(
useGPU
)
then
success
CUDA
=
gpu_memcpy
(
int
(
loc
(
aux3
(
1
)),
kind
=
c_intptr_t
),
a_dev
,
&
success
GPU
=
gpu_memcpy
(
int
(
loc
(
aux3
(
1
)),
kind
=
c_intptr_t
),
a_dev
,
&
1
*
size_of_datatype
,
gpuMemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag: a_dev 6"
,
success
CUDA
)
check_memcpy_cuda
(
"tridiag: a_dev 6"
,
success
GPU
)
d_vec
(
1
)
=
PRECISION_REAL
(
aux3
(
1
))
else
!useGPU
d_vec
(
1
)
=
PRECISION_REAL
(
a_mat
(
1
,
1
))
...
...
@@ -1036,9 +1036,9 @@ subroutine tridiag_&
if
(
my_prow
==
prow
(
1
,
nblk
,
np_rows
)
.and.
my_pcol
==
pcol
(
2
,
nblk
,
np_cols
))
then
if
(
useGPU
)
then
success
CUDA
=
gpu_memcpy
(
int
(
loc
(
e_vec
(
1
)),
kind
=
c_intptr_t
),
a_dev
+
(
matrixRows
*
(
l_cols
-
1
))
*
size_of_datatype
,
&
success
GPU
=
gpu_memcpy
(
int
(
loc
(
e_vec
(
1
)),
kind
=
c_intptr_t
),
a_dev
+
(
matrixRows
*
(
l_cols
-
1
))
*
size_of_datatype
,
&
1
*
size_of_datatype
,
gpuMemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag: a_dev 7"
,
success
CUDA
)
check_memcpy_cuda
(
"tridiag: a_dev 7"
,
success
GPU
)
else
!useGPU
e_vec
(
1
)
=
a_mat
(
1
,
l_cols
)
! use last l_cols value of loop above
endif
!useGPU
...
...
@@ -1047,8 +1047,8 @@ subroutine tridiag_&
! Store d_vec(1)
if
(
my_prow
==
prow
(
1
,
nblk
,
np_rows
)
.and.
my_pcol
==
pcol
(
1
,
nblk
,
np_cols
))
then
if
(
useGPU
)
then
success
CUDA
=
gpu_memcpy
(
int
(
loc
(
d_vec
(
1
)),
kind
=
c_intptr_t
),
a_dev
,
1
*
size_of_datatype
,
gpuMemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag: a_dev 8"
,
success
CUDA
)
success
GPU
=
gpu_memcpy
(
int
(
loc
(
d_vec
(
1
)),
kind
=
c_intptr_t
),
a_dev
,
1
*
size_of_datatype
,
gpuMemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag: a_dev 8"
,
success
GPU
)
else
!useGPU
if
(
isSkewsymmetric
)
then
d_vec
(
1
)
=
0.0_rk
...
...
@@ -1064,26 +1064,26 @@ subroutine tridiag_&
if
(
useGPU
)
then
! todo: should we leave a_mat on the device for further use?
success
CUDA
=
gpu_free
(
a_dev
)
check_dealloc_cuda
(
"tridiag: a_dev 9"
,
success
CUDA
)
success
GPU
=
gpu_free
(
a_dev
)
check_dealloc_cuda
(
"tridiag: a_dev 9"
,
success
GPU
)
success
CUDA
=
gpu_free
(
v_row_dev
)
check_dealloc_cuda
(
"tridiag: v_row_dev"
,
success
CUDA
)
success
GPU
=
gpu_free
(
v_row_dev
)
check_dealloc_cuda
(
"tridiag: v_row_dev"
,
success
GPU
)
success
CUDA
=
gpu_free
(
u_row_dev
)
check_dealloc_cuda
(
"tridiag: (u_row_dev"
,
success
CUDA
)
success
GPU
=
gpu_free
(
u_row_dev
)
check_dealloc_cuda
(
"tridiag: (u_row_dev"
,
success
GPU
)
success
CUDA
=
gpu_free
(
v_col_dev
)
check_dealloc_cuda
(
"tridiag: v_col_dev"
,
success
CUDA
)
success
GPU
=
gpu_free
(
v_col_dev
)
check_dealloc_cuda
(
"tridiag: v_col_dev"
,
success
GPU
)
success
CUDA
=
gpu_free
(
u_col_dev
)
check_dealloc_cuda
(
"tridiag: u_col_dev "
,
success
CUDA
)
success
GPU
=
gpu_free
(
u_col_dev
)
check_dealloc_cuda
(
"tridiag: u_col_dev "
,
success
GPU
)
success
CUDA
=
gpu_free
(
vu_stored_rows_dev
)
check_dealloc_cuda
(
"tridiag: vu_stored_rows_dev "
,
success
CUDA
)
success
GPU
=
gpu_free
(
vu_stored_rows_dev
)
check_dealloc_cuda
(
"tridiag: vu_stored_rows_dev "
,
success
GPU
)
success
CUDA
=
gpu_free
(
uv_stored_cols_dev
)
check_dealloc_cuda
(
"tridiag:uv_stored_cols_dev "
,
success
CUDA
)
success
GPU
=
gpu_free
(
uv_stored_cols_dev
)
check_dealloc_cuda
(
"tridiag:uv_stored_cols_dev "
,
success
GPU
)
endif
! distribute the arrays d_vec and e_vec to all processors
...
...
@@ -1112,36 +1112,36 @@ subroutine tridiag_&
check_deallocate
(
"tridiag: tmp_real"
,
istat
,
errorMessage
)
if
(
useGPU
)
then
success
CUDA
=
gpu_host_unregister
(
int
(
loc
(
a_mat
),
kind
=
c_intptr_t
))
check_host_unregister_cuda
(
"tridiag: a_mat"
,
success
CUDA
)
success
GPU
=
gpu_host_unregister
(
int
(
loc
(
a_mat
),
kind
=
c_intptr_t
))
check_host_unregister_cuda
(
"tridiag: a_mat"
,
success
GPU
)
success
CUDA
=
gpu_free_host
(
v_row_host
)
check_host_dealloc_cuda
(
"tridiag: v_row_host"
,
success
CUDA
)
success
GPU
=
gpu_free_host
(
v_row_host
)
check_host_dealloc_cuda
(
"tridiag: v_row_host"
,
success
GPU
)
nullify
(
v_row
)
success
CUDA
=
gpu_free_host
(
v_col_host
)
check_host_dealloc_cuda
(
"tridiag: v_col_host"
,
success
CUDA
)
success
GPU
=
gpu_free_host
(
v_col_host
)
check_host_dealloc_cuda
(
"tridiag: v_col_host"
,
success
GPU
)
nullify
(
v_col
)
success
CUDA
=
gpu_free_host
(
u_col_host
)
check_host_dealloc_cuda
(
"tridiag: u_col_host"
,
success
CUDA
)
success
GPU
=
gpu_free_host
(
u_col_host
)
check_host_dealloc_cuda
(
"tridiag: u_col_host"
,
success
GPU
)
nullify
(
u_col
)
success
CUDA
=
gpu_free_host
(
u_row_host
)
check_host_dealloc_cuda
(
"tridiag: u_row_host"
,
success
CUDA
)
success
GPU
=
gpu_free_host
(
u_row_host
)
check_host_dealloc_cuda