elpa / elpa / Commits / fe2e6726

Commit fe2e6726, authored Mar 01, 2021 by Andreas Marek

    Rename check_*_CUDA -> check_*_GPU

Parent: f515b7b2
Changes: 9 files
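The renamed identifiers are not subroutine calls but C-preprocessor macros that wrap ELPA's error-checking helpers and splice in the source location. Their definitions are not part of this diff; the following is only a rough sketch of the shape such a renamed pair plausibly has, where the helper name check_alloc_gpu_f and its signature are assumptions for illustration, not taken from this commit:

    ! Hypothetical sketch of the macro being renamed; the real definitions live
    ! in ELPA's error-checking include file, which this page does not show.
    #define check_alloc_gpu(file, success) call check_alloc_gpu_f(file, __LINE__, success)

    ! Assumed helper: abort with a location message when a GPU allocation failed.
    subroutine check_alloc_gpu_f(file_name, line, success)
      implicit none
      character(len=*), intent(in) :: file_name
      integer, intent(in)          :: line
      logical, intent(in)          :: success
      if (.not. success) then
        print *, "GPU allocation failed: ", file_name, " at line ", line
        stop 1
      endif
    end subroutine

Renaming only this macro layer keeps every call site vendor-neutral, presumably so the underlying implementation can dispatch to CUDA or another GPU backend without further source changes.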
src/elpa1/elpa1_trans_ev_template.F90
@@ -243,44 +243,44 @@ subroutine trans_ev_&
       !&", "hvm1", istat, errorMessage)
       num = (max_local_rows*max_stored_rows) * size_of_datatype
       successGPU = gpu_malloc_host(hvm1_host, num)
-      check_alloc_cuda("trans_ev: hvm1_host", successGPU)
+      check_alloc_gpu("trans_ev: hvm1_host", successGPU)
       call c_f_pointer(hvm1_host, hvm1, (/(max_local_rows*max_stored_rows)/))

       num = (max_stored_rows*max_stored_rows) * size_of_datatype
       successGPU = gpu_malloc_host(tmat_host, num)
-      check_alloc_cuda("trans_ev: tmat_host", successGPU)
+      check_alloc_gpu("trans_ev: tmat_host", successGPU)
       call c_f_pointer(tmat_host, tmat, (/max_stored_rows,max_stored_rows/))

       num = (max_local_cols*max_stored_rows) * size_of_datatype
       successGPU = gpu_malloc_host(tmp1_host, num)
-      check_alloc_cuda("trans_ev: tmp1_host", successGPU)
+      check_alloc_gpu("trans_ev: tmp1_host", successGPU)
       call c_f_pointer(tmp1_host, tmp1, (/(max_local_cols*max_stored_rows)/))

       num = (max_local_cols*max_stored_rows) * size_of_datatype
       successGPU = gpu_malloc_host(tmp2_host, num)
-      check_alloc_cuda("trans_ev: tmp2_host", successGPU)
+      check_alloc_gpu("trans_ev: tmp2_host", successGPU)
       call c_f_pointer(tmp2_host, tmp2, (/(max_local_cols*max_stored_rows)/))

       successGPU = gpu_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_datatype)
-      check_alloc_cuda("trans_ev", successGPU)
+      check_alloc_gpu("trans_ev", successGPU)

       successGPU = gpu_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_datatype)
-      check_alloc_cuda("trans_ev", successGPU)
+      check_alloc_gpu("trans_ev", successGPU)

       successGPU = gpu_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_datatype)
-      check_alloc_cuda("trans_ev", successGPU)
+      check_alloc_gpu("trans_ev", successGPU)

       num = ldq * matrixCols * size_of_datatype
       successGPU = gpu_malloc(q_dev, num)
-      check_alloc_cuda("trans_ev", successGPU)
+      check_alloc_gpu("trans_ev", successGPU)

       successGPU = gpu_host_register(int(loc(q_mat),kind=c_intptr_t), num, &
                     gpuHostRegisterDefault)
-      check_host_register_cuda("trans_ev: q_mat", successGPU)
+      check_host_register_gpu("trans_ev: q_mat", successGPU)

       successGPU = gpu_memcpy(q_dev, int(loc(q_mat(1,1)),kind=c_intptr_t), &
                     num, gpuMemcpyHostToDevice)
-      check_memcpy_cuda("trans_ev", successGPU)
+      check_memcpy_gpu("trans_ev", successGPU)
     endif ! useGPU

     do istep = 1, na, blockStep
@@ -390,12 +390,12 @@ subroutine trans_ev_&
         successGPU = gpu_memcpy(hvm_dev, int(loc(hvm1(1)),kind=c_intptr_t), &
                       hvm_ubnd * nstor * size_of_datatype, gpuMemcpyHostToDevice)
-        check_memcpy_cuda("trans_ev", successGPU)
+        check_memcpy_gpu("trans_ev", successGPU)

         !tmat_dev = tmat
         successGPU = gpu_memcpy(tmat_dev, int(loc(tmat(1,1)),kind=c_intptr_t), &
                       max_stored_rows * max_stored_rows * size_of_datatype, gpuMemcpyHostToDevice)
-        check_memcpy_cuda("trans_ev", successGPU)
+        check_memcpy_gpu("trans_ev", successGPU)
       endif

       ! Q = Q - V * T * V**T * Q
@@ -422,7 +422,7 @@ subroutine trans_ev_&
       if (useGPU) then
         successGPU = gpu_memset(tmp_dev, 0, l_cols * nstor * size_of_datatype)
-        check_memcpy_cuda("trans_ev", successGPU)
+        check_memcpy_gpu("trans_ev", successGPU)
       else
         tmp1(1:l_cols*nstor) = 0
       endif
@@ -434,7 +434,7 @@ subroutine trans_ev_&
       if (useGPU) then
         successGPU = gpu_memcpy(int(loc(tmp1(1)),kind=c_intptr_t), tmp_dev, &
                       max_local_cols * max_stored_rows * size_of_datatype, gpuMemcpyDeviceToHost)
-        check_memcpy_cuda("trans_ev", successGPU)
+        check_memcpy_gpu("trans_ev", successGPU)
       endif
       call obj%timer%start("mpi_communication")
       call mpi_allreduce(tmp1, tmp2, int(nstor*l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, MPI_SUM, &
@@ -444,7 +444,7 @@ subroutine trans_ev_&
       if (useGPU) then
         successGPU = gpu_memcpy(tmp_dev, int(loc(tmp2(1)),kind=c_intptr_t), &
                       max_local_cols * max_stored_rows * size_of_datatype, gpuMemcpyHostToDevice)
-        check_memcpy_cuda("trans_ev", successGPU)
+        check_memcpy_gpu("trans_ev", successGPU)
       endif ! useGPU
@@ -500,25 +500,25 @@ subroutine trans_ev_&
       !q_mat = q_dev
       successGPU = gpu_memcpy(int(loc(q_mat(1,1)),kind=c_intptr_t), &
                     q_dev, ldq * matrixCols * size_of_datatype, gpuMemcpyDeviceToHost)
-      check_memcpy_cuda("trans_ev", successGPU)
+      check_memcpy_gpu("trans_ev", successGPU)

       successGPU = gpu_host_unregister(int(loc(q_mat),kind=c_intptr_t))
-      check_host_unregister_cuda("trans_ev: q_mat", successGPU)
+      check_host_unregister_gpu("trans_ev: q_mat", successGPU)

       successGPU = gpu_free_host(hvm1_host)
-      check_host_dealloc_cuda("trans_ev: hvm1_host", successGPU)
+      check_host_dealloc_gpu("trans_ev: hvm1_host", successGPU)
       nullify(hvm1)

       successGPU = gpu_free_host(tmat_host)
-      check_host_dealloc_cuda("trans_ev: tmat_host", successGPU)
+      check_host_dealloc_gpu("trans_ev: tmat_host", successGPU)
       nullify(tmat)

       successGPU = gpu_free_host(tmp1_host)
-      check_host_dealloc_cuda("trans_ev: tmp1_host", successGPU)
+      check_host_dealloc_gpu("trans_ev: tmp1_host", successGPU)
       nullify(tmp1)

       successGPU = gpu_free_host(tmp2_host)
-      check_host_dealloc_cuda("trans_ev: tmp2_host", successGPU)
+      check_host_dealloc_gpu("trans_ev: tmp2_host", successGPU)
       nullify(tmp2)

       !deallocate(hvm1, stat=istat, errmsg=errorMessage)
@@ -531,16 +531,16 @@ subroutine trans_ev_&
       !deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
       successGPU = gpu_free(q_dev)
-      check_dealloc_cuda("trans_ev", successGPU)
+      check_dealloc_gpu("trans_ev", successGPU)

       successGPU = gpu_free(tmp_dev)
-      check_dealloc_cuda("trans_ev", successGPU)
+      check_dealloc_gpu("trans_ev", successGPU)

       successGPU = gpu_free(hvm_dev)
-      check_dealloc_cuda("trans_ev", successGPU)
+      check_dealloc_gpu("trans_ev", successGPU)

       successGPU = gpu_free(tmat_dev)
-      check_dealloc_cuda("trans_ev", successGPU)
+      check_dealloc_gpu("trans_ev", successGPU)
     else
       deallocate(tmat, tmp1, tmp2, stat=istat, errmsg=errorMessage)
       check_deallocate("trans_ev_&
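The hunks above repeat one pattern many times: pinned (page-locked) host buffers are allocated through the GPU runtime and then exposed to Fortran via c_f_pointer. Below is a minimal self-contained sketch of that pattern, assuming gpu_malloc_host/gpu_free_host are wrappers in the style of cudaMallocHost/cudaFreeHost that return a logical success flag; the surrounding declarations are illustrative, not ELPA code.

    ! Illustrative only: allocate a pinned buffer for n doubles, view it as a
    ! Fortran array, use it, and release it again.
    use, intrinsic :: iso_c_binding
    type(c_ptr)               :: buf_host
    real(c_double), pointer   :: buf(:)
    integer(kind=c_intptr_t)  :: num
    logical                   :: successGPU

    num = int(n, kind=c_intptr_t) * 8_c_intptr_t   ! bytes for n doubles
    successGPU = gpu_malloc_host(buf_host, num)    ! pinned allocation via the runtime
    check_host_alloc_gpu("sketch: buf_host", successGPU)
    call c_f_pointer(buf_host, buf, (/n/))         ! map the C pointer to a Fortran view

    buf(1:n) = 0.0_c_double                        ! ordinary array access from here on

    successGPU = gpu_free_host(buf_host)           ! release the pinned memory
    check_host_dealloc_gpu("sketch: buf_host", successGPU)
    nullify(buf)

Pinning matters because host-device copies from pageable memory force an extra staging copy; pinned pages let gpu_memcpy DMA directly, which is why these buffers replace plain allocate in the GPU path.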
src/elpa1/elpa1_tridiag_template.F90
@@ -291,33 +291,33 @@ subroutine tridiag_&
     if (useGPU) then
       num = (max_local_rows+1) * size_of_datatype
       successGPU = gpu_malloc_host(v_row_host, num)
-      check_host_alloc_cuda("tridiag: v_row_host", successGPU)
+      check_host_alloc_gpu("tridiag: v_row_host", successGPU)
       call c_f_pointer(v_row_host, v_row, (/(max_local_rows+1)/))

       num = (max_local_cols) * size_of_datatype
       successGPU = gpu_malloc_host(v_col_host, num)
-      check_host_alloc_cuda("tridiag: v_col_host", successGPU)
+      check_host_alloc_gpu("tridiag: v_col_host", successGPU)
       call c_f_pointer(v_col_host, v_col, (/(max_local_cols)/))

       num = (max_local_cols) * size_of_datatype
       successGPU = gpu_malloc_host(u_col_host, num)
-      check_host_alloc_cuda("tridiag: u_col_host", successGPU)
+      check_host_alloc_gpu("tridiag: u_col_host", successGPU)
       call c_f_pointer(u_col_host, u_col, (/(max_local_cols)/))

       num = (max_local_rows) * size_of_datatype
       successGPU = gpu_malloc_host(u_row_host, num)
-      check_host_alloc_cuda("tridiag: u_row_host", successGPU)
+      check_host_alloc_gpu("tridiag: u_row_host", successGPU)
       call c_f_pointer(u_row_host, u_row, (/(max_local_rows)/))

       num = (max_local_rows * 2*max_stored_uv) * size_of_datatype
       successGPU = gpu_host_register(int(loc(vu_stored_rows),kind=c_intptr_t), num, &
                     gpuHostRegisterDefault)
-      check_host_register_cuda("tridiag: vu_stored_roes", successGPU)
+      check_host_register_gpu("tridiag: vu_stored_roes", successGPU)

       num = (max_local_cols * 2*max_stored_uv) * size_of_datatype
       successGPU = gpu_host_register(int(loc(uv_stored_cols),kind=c_intptr_t), num, &
                     gpuHostRegisterDefault)
-      check_host_register_cuda("tridiag: uv_stored_cols", successGPU)
+      check_host_register_gpu("tridiag: uv_stored_cols", successGPU)

 #if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
       num = na * 8
@@ -326,7 +326,7 @@ subroutine tridiag_&
 #endif
       successGPU = gpu_host_register(int(loc(e_vec),kind=c_intptr_t), num, &
                     gpuHostRegisterDefault)
-      check_host_register_cuda("tridiag: e_vec", successGPU)
+      check_host_register_gpu("tridiag: e_vec", successGPU)

 #if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
       num = na * 8
@@ -335,7 +335,7 @@ subroutine tridiag_&
 #endif
       successGPU = gpu_host_register(int(loc(d_vec),kind=c_intptr_t), num, &
                     gpuHostRegisterDefault)
-      check_host_register_cuda("tridiag: d_vec", successGPU)
+      check_host_register_gpu("tridiag: d_vec", successGPU)
     else
       allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
@@ -374,23 +374,23 @@ subroutine tridiag_&
     if (useGPU) then
       successGPU = gpu_malloc(v_row_dev, max_local_rows * size_of_datatype)
-      check_alloc_cuda("tridiag: v_row_dev", successGPU)
+      check_alloc_gpu("tridiag: v_row_dev", successGPU)

       successGPU = gpu_malloc(u_row_dev, max_local_rows * size_of_datatype)
-      check_alloc_cuda("tridiag: u_row_dev", successGPU)
+      check_alloc_gpu("tridiag: u_row_dev", successGPU)

       successGPU = gpu_malloc(v_col_dev, max_local_cols * size_of_datatype)
-      check_alloc_cuda("tridiag: v_col_dev", successGPU)
+      check_alloc_gpu("tridiag: v_col_dev", successGPU)

       successGPU = gpu_malloc(u_col_dev, max_local_cols * size_of_datatype)
-      check_alloc_cuda("tridiag: u_col_dev", successGPU)
+      check_alloc_gpu("tridiag: u_col_dev", successGPU)

       successGPU = gpu_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
-      check_alloc_cuda("tridiag: vu_stored_rows_dev", successGPU)
+      check_alloc_gpu("tridiag: vu_stored_rows_dev", successGPU)

       successGPU = gpu_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
-      check_alloc_cuda("tridiag: vu_stored_rows_dev", successGPU)
+      check_alloc_gpu("tridiag: vu_stored_rows_dev", successGPU)
     endif !useGPU
@@ -417,15 +417,15 @@ subroutine tridiag_&
       num = matrixRows * matrixCols * size_of_datatype
       successGPU = gpu_malloc(a_dev, num)
-      check_alloc_cuda("tridiag: a_dev", successGPU)
+      check_alloc_gpu("tridiag: a_dev", successGPU)

       successGPU = gpu_host_register(int(loc(a_mat),kind=c_intptr_t), num, &
                     gpuHostRegisterDefault)
-      check_host_register_cuda("tridiag: a_mat", successGPU)
+      check_host_register_gpu("tridiag: a_mat", successGPU)

       successGPU = gpu_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
                     num, gpuMemcpyHostToDevice)
-      check_memcpy_cuda("tridiag: a_dev", successGPU)
+      check_memcpy_gpu("tridiag: a_dev", successGPU)
     endif

     ! main cycle of tridiagonalization
@@ -453,7 +453,7 @@ subroutine tridiag_&
         successGPU = gpu_memcpy(int(loc(v_row),kind=c_intptr_t), &
                       a_dev + a_offset, (l_rows) * size_of_datatype, gpuMemcpyDeviceToHost)
-        check_memcpy_cuda("tridiag a_dev 1", successGPU)
+        check_memcpy_gpu("tridiag a_dev 1", successGPU)
       else
         v_row(1:l_rows) = a_mat(1:l_rows, l_cols+1)
       endif
@@ -565,19 +565,19 @@ subroutine tridiag_&
       if (l_rows > 0 .and. l_cols > 0) then
         if (useGPU) then
           successGPU = gpu_memset(u_col_dev, 0, l_cols * size_of_datatype)
-          check_memcpy_cuda("tridiag: u_col_dev", successGPU)
+          check_memcpy_gpu("tridiag: u_col_dev", successGPU)

           successGPU = gpu_memset(u_row_dev, 0, l_rows * size_of_datatype)
-          check_memcpy_cuda("tridiag: u_row_dev", successGPU)
+          check_memcpy_gpu("tridiag: u_row_dev", successGPU)

           successGPU = gpu_memcpy(v_col_dev, int(loc(v_col(1)),kind=c_intptr_t), &
                         l_cols * size_of_datatype, gpuMemcpyHostToDevice)
-          check_memcpy_cuda("tridiag: v_col_dev", successGPU)
+          check_memcpy_gpu("tridiag: v_col_dev", successGPU)

           successGPU = gpu_memcpy(v_row_dev, int(loc(v_row(1)),kind=c_intptr_t), &
                         l_rows * size_of_datatype, gpuMemcpyHostToDevice)
-          check_memcpy_cuda("tridiag: v_row_dev", successGPU)
+          check_memcpy_gpu("tridiag: v_row_dev", successGPU)
         endif ! useGU

 #ifdef WITH_OPENMP_TRADITIONAL
@@ -743,11 +743,11 @@ subroutine tridiag_&
           successGPU = gpu_memcpy(int(loc(u_col(1)),kind=c_intptr_t), &
                         u_col_dev, l_cols * size_of_datatype, gpuMemcpyDeviceToHost)
-          check_memcpy_cuda("tridiag: u_col_dev 1", successGPU)
+          check_memcpy_gpu("tridiag: u_col_dev 1", successGPU)

           successGPU = gpu_memcpy(int(loc(u_row(1)),kind=c_intptr_t), &
                         u_row_dev, l_rows * size_of_datatype, gpuMemcpyDeviceToHost)
-          check_memcpy_cuda("tridiag: u_row_dev 1", successGPU)
+          check_memcpy_gpu("tridiag: u_row_dev 1", successGPU)
         endif ! useGPU

 #ifdef WITH_OPENMP_TRADITIONAL
@@ -876,12 +876,12 @@ subroutine tridiag_&
           successGPU = gpu_memcpy(vu_stored_rows_dev, int(loc(vu_stored_rows(1,1)),kind=c_intptr_t), &
                         max_local_rows * 2 * max_stored_uv * &
                         size_of_datatype, gpuMemcpyHostToDevice)
-          check_memcpy_cuda("tridiag: uv_stored_rows_dev", successGPU)
+          check_memcpy_gpu("tridiag: uv_stored_rows_dev", successGPU)

           successGPU = gpu_memcpy(uv_stored_cols_dev, int(loc(uv_stored_cols(1,1)),kind=c_intptr_t), &
                         max_local_cols * 2 * max_stored_uv * &
                         size_of_datatype, gpuMemcpyHostToDevice)
-          check_memcpy_cuda("tridiag: uv_stored_cols_dev", successGPU)
+          check_memcpy_gpu("tridiag: uv_stored_cols_dev", successGPU)
         endif

         do i = 0, (istep-2)/tile_size
@@ -946,7 +946,7 @@ subroutine tridiag_&
           successGPU = gpu_memcpy(int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), a_dev + a_offset, &
                         1 * size_of_datatype, gpuMemcpyDeviceToHost)
-          check_memcpy_cuda("tridiag: a_dev 3", successGPU)
+          check_memcpy_gpu("tridiag: a_dev 3", successGPU)
         endif

         if (n_stored_vecs > 0) then
@@ -967,11 +967,11 @@ subroutine tridiag_&
         if (useGPU) then
           !a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols)
           !successGPU = cuda_threadsynchronize()
-          !check_memcpy_cuda("tridiag: a_dev 4a5a", successGPU)
+          !check_memcpy_gpu("tridiag: a_dev 4a5a", successGPU)

           successGPU = gpu_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), &
                         int(1 * size_of_datatype, kind=c_intptr_t), gpuMemcpyHostToDevice)
-          check_memcpy_cuda("tridiag: a_dev 4", successGPU)
+          check_memcpy_gpu("tridiag: a_dev 4", successGPU)
         endif
       endif
@@ -986,7 +986,7 @@ subroutine tridiag_&
       if (useGPU) then
         successGPU = gpu_memcpy(int(loc(aux3(1)),kind=c_intptr_t), &
                       a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
                       1 * size_of_datatype, gpuMemcpyDeviceToHost)
-        check_memcpy_cuda("tridiag: a_dev 5", successGPU)
+        check_memcpy_gpu("tridiag: a_dev 5", successGPU)
         vrl = aux3(1)
       else !useGPU
         vrl = a_mat(1, l_cols)
@@ -1022,7 +1022,7 @@ subroutine tridiag_&
       if (useGPU) then
         successGPU = gpu_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev, &
                       1 * size_of_datatype, gpuMemcpyDeviceToHost)
-        check_memcpy_cuda("tridiag: a_dev 6", successGPU)
+        check_memcpy_gpu("tridiag: a_dev 6", successGPU)
         d_vec(1) = PRECISION_REAL(aux3(1))
       else !useGPU
         d_vec(1) = PRECISION_REAL(a_mat(1,1))
@@ -1038,7 +1038,7 @@ subroutine tridiag_&
     if (useGPU) then
       successGPU = gpu_memcpy(int(loc(e_vec(1)),kind=c_intptr_t), &
                     a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
                     1 * size_of_datatype, gpuMemcpyDeviceToHost)
-      check_memcpy_cuda("tridiag: a_dev 7", successGPU)
+      check_memcpy_gpu("tridiag: a_dev 7", successGPU)
     else !useGPU
       e_vec(1) = a_mat(1, l_cols) ! use last l_cols value of loop above
     endif !useGPU
@@ -1048,7 +1048,7 @@ subroutine tridiag_&
     if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols)) then
       if (useGPU) then
         successGPU = gpu_memcpy(int(loc(d_vec(1)),kind=c_intptr_t), a_dev, &
                       1 * size_of_datatype, gpuMemcpyDeviceToHost)
-        check_memcpy_cuda("tridiag: a_dev 8", successGPU)
+        check_memcpy_gpu("tridiag: a_dev 8", successGPU)
       else !useGPU
         if (isSkewsymmetric) then
           d_vec(1) = 0.0_rk
@@ -1065,25 +1065,25 @@ subroutine tridiag_&
     if (useGPU) then
       ! todo: should we leave a_mat on the device for further use?
       successGPU = gpu_free(a_dev)
-      check_dealloc_cuda("tridiag: a_dev 9", successGPU)
+      check_dealloc_gpu("tridiag: a_dev 9", successGPU)

       successGPU = gpu_free(v_row_dev)
-      check_dealloc_cuda("tridiag: v_row_dev", successGPU)
+      check_dealloc_gpu("tridiag: v_row_dev", successGPU)

       successGPU = gpu_free(u_row_dev)
-      check_dealloc_cuda("tridiag: (u_row_dev", successGPU)
+      check_dealloc_gpu("tridiag: (u_row_dev", successGPU)

       successGPU = gpu_free(v_col_dev)
-      check_dealloc_cuda("tridiag: v_col_dev", successGPU)
+      check_dealloc_gpu("tridiag: v_col_dev", successGPU)

       successGPU = gpu_free(u_col_dev)
-      check_dealloc_cuda("tridiag: u_col_dev ", successGPU)
+      check_dealloc_gpu("tridiag: u_col_dev ", successGPU)

       successGPU = gpu_free(vu_stored_rows_dev)
-      check_dealloc_cuda("tridiag: vu_stored_rows_dev ", successGPU)
+      check_dealloc_gpu("tridiag: vu_stored_rows_dev ", successGPU)

       successGPU = gpu_free(uv_stored_cols_dev)
-      check_dealloc_cuda("tridiag:uv_stored_cols_dev ", successGPU)
+      check_dealloc_gpu("tridiag:uv_stored_cols_dev ", successGPU)
     endif

     ! distribute the arrays d_vec and e_vec to all processors
@@ -1113,35 +1113,35 @@ subroutine tridiag_&
     if (useGPU) then
       successGPU = gpu_host_unregister(int(loc(a_mat),kind=c_intptr_t))
-      check_host_unregister_cuda("tridiag: a_mat", successGPU)
+      check_host_unregister_gpu("tridiag: a_mat", successGPU)

       successGPU = gpu_free_host(v_row_host)
-      check_host_dealloc_cuda("tridiag: v_row_host", successGPU)
+      check_host_dealloc_gpu("tridiag: v_row_host", successGPU)
       nullify(v_row)

       successGPU = gpu_free_host(v_col_host)
-      check_host_dealloc_cuda("tridiag: v_col_host", successGPU)
+      check_host_dealloc_gpu("tridiag: v_col_host", successGPU)
       nullify(v_col)

       successGPU = gpu_free_host(u_col_host)
-      check_host_dealloc_cuda("tridiag: u_col_host", successGPU)
+      check_host_dealloc_gpu("tridiag: u_col_host", successGPU)
       nullify(u_col)

       successGPU = gpu_free_host(u_row_host)
-      check_host_dealloc_cuda("tridiag: u_row_host", successGPU)
+      check_host_dealloc_gpu("tridiag: u_row_host", successGPU)
       nullify(u_row)

       successGPU = gpu_host_unregister(int(loc(uv_stored_cols),kind=c_intptr_t))
-      check_host_unregister_cuda("tridiag: uv_stored_cols", successGPU)
+      check_host_unregister_gpu("tridiag: uv_stored_cols", successGPU)

       successGPU = gpu_host_unregister(int(loc(vu_stored_rows),kind=c_intptr_t))
-      check_host_unregister_cuda("tridiag: vu_stored_rows", successGPU)
+      check_host_unregister_gpu("tridiag: vu_stored_rows", successGPU)

       successGPU = gpu_host_unregister(int(loc(e_vec),kind=c_intptr_t))
-      check_host_unregister_cuda("tridiag: e_vec", successGPU)
+      check_host_unregister_gpu("tridiag: e_vec", successGPU)

       successGPU = gpu_host_unregister(int(loc(d_vec),kind=c_intptr_t))
-      check_host_unregister_cuda("tridiag: d_vec", successGPU)
+      check_host_unregister_gpu("tridiag: d_vec", successGPU)
     else
       deallocate(v_row, v_col, u_row, u_col, stat=istat, errmsg=errorMessage)
       check_deallocate("tridiag: v_row, v_col, u_row, u_col", istat, errorMessage)
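Throughout the tridiag hunks, host addresses are handed to gpu_memcpy as plain integers via int(loc(...), kind=c_intptr_t), and device addresses are advanced by byte offsets such as a_dev + a_offset. A compact sketch of that addressing convention for fetching a single element of the column-major matrix back from the device; the declarations and the offset formula are assumed for illustration, not copied from ELPA:

    ! Illustrative only: byte offset of a_mat(l_rows, l_cols) in a column-major
    ! matrix with leading dimension matrixRows, then a one-element copy.
    integer(kind=c_intptr_t) :: a_dev, a_offset
    a_offset   = ((l_cols-1)*matrixRows + (l_rows-1)) * size_of_datatype
    successGPU = gpu_memcpy(int(loc(a_mat(l_rows, l_cols)), kind=c_intptr_t), &
                            a_dev + a_offset, &
                            1 * size_of_datatype, gpuMemcpyDeviceToHost)
    check_memcpy_gpu("sketch: a_dev", successGPU)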
src/elpa1/elpa_multiply_a_b.F90
@@ -202,34 +202,34 @@
       ! copy b to b_dev
       num = ldb * ldbCols * size_of_datatype
       successGPU = gpu_malloc(b_dev, num)
-      check_alloc_cuda("elpa_mult_at_b: b_dev", successGPU)
+      check_alloc_gpu("elpa_mult_at_b: b_dev", successGPU)

       successGPU = gpu_host_register(int(loc(b),kind=c_intptr_t), num, &
                     cudaHostRegisterDefault)
-      check_host_register_cuda("elpa_mult_at_b: b", successGPU)
+      check_host_register_gpu("elpa_mult_at_b: b", successGPU)

       successGPU = gpu_memcpy(b_dev, int(loc(b),kind=c_intptr_t), num, &
                     cudaMemcpyHostToDevice)
-      check_memcpy_cuda("elpa_mult_at_b: b to b_dev", successGPU)
+      check_memcpy_gpu("elpa_mult_at_b: b to b_dev", successGPU)

       num = l_rows * nblk_mult * size_of_datatype
       successGPU = gpu_malloc_host(aux_host, num)
-      check_host_alloc_cuda("elpa_mult_at_b: aux_host", successGPU)
+      check_host_alloc_gpu("elpa_mult_at_b: aux_host", successGPU)
       call c_f_pointer(aux_host, aux_mat, (/l_rows, nblk_mult/))

       successGPU = gpu_malloc(aux_dev, num)
-      check_alloc_cuda("elpa_mult_at_b: aux_dev", successGPU)
+      check_alloc_gpu("elpa_mult_at_b: aux_dev", successGPU)

       num = nblk_mult * l_cols * size_of_datatype
       successGPU = gpu_malloc_host(tmp1_host, num)
-      check_host_alloc_cuda("elpa_mult_at_b: tmp1_host", successGPU)
+      check_host_alloc_gpu("elpa_mult_at_b: tmp1_host", successGPU)
       call c_f_pointer(tmp1_host, tmp1, (/nblk_mult, l_cols/))

       successGPU = gpu_malloc(tmp1_dev, num)
-      check_alloc_cuda("elpa_mult_at_b: tmp1_dev", successGPU)
+      check_alloc_gpu("elpa_mult_at_b: tmp1_dev", successGPU)
     else ! useGPU
       allocate(aux_mat(l_rows, nblk_mult), stat=istat, errmsg=errorMessage)
       check_allocate("elpa_mult_at_b: aux_mat", istat, errorMessage)
@@ -357,7 +357,7 @@
           num = l_rows * nblk_mult * size_of_datatype
           successGPU = gpu_memcpy(aux_dev, int(loc(aux_mat),kind=c_intptr_t), &
                         num, gpuMemcpyHostToDevice)
-          check_memcpy_cuda("elpa_mult_at_b: aux_mat to aux_dev", successGPU)
+          check_memcpy_gpu("elpa_mult_at_b: aux_mat to aux_dev", successGPU)

           aux_off = (lrs-1) * size_of_datatype
           b_off = ((lcs-1)*ldb + lrs-1) * size_of_datatype
@@ -371,7 +371,7 @@
           num = nstor * (lce-lcs+1) * size_of_datatype
           successGPU = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
                         tmp1_dev, num, gpuMemcpyDeviceToHost)
-          check_memcpy_cuda("elpa_mult_at_b: tmp1_dev to tmp1", successGPU)
+          check_memcpy_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successGPU)
         else ! useGPU
           call obj%timer%start("blas")
           call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', int(nstor,kind=BLAS_KIND), &
@@ -414,25 +414,25 @@
     if (useGPU) then
       successGPU = gpu_free(b_dev)
-      check_dealloc_cuda("elpa_multiply_a_b: b_dev", successGPU)
+      check_dealloc_gpu("elpa_multiply_a_b: b_dev", successGPU)

       successGPU = gpu_host_unregister(int(loc(b),kind=c_intptr_t))
-      check_host_unregister_cuda("elpa_multiply_a_b: b", successGPU)
+      check_host_unregister_gpu("elpa_multiply_a_b: b", successGPU)

       nullify(aux_mat)
       nullify(tmp1)

       successGPU = gpu_free_host(aux_host)
-      check_host_dealloc_cuda("elpa_multiply_a_b: aux_host", successGPU)
+      check_host_dealloc_gpu("elpa_multiply_a_b: aux_host", successGPU)

       successGPU = gpu_free(aux_dev)
-      check_dealloc_cuda("elpa_multiply_a_b: aux_dev", successGPU)
+      check_dealloc_gpu("elpa_multiply_a_b: aux_dev", successGPU)

       successGPU = gpu_free_host(tmp1_host)
-      check_host_dealloc_cuda("elpa_multiply_a_b: tmp1_host", successGPU)
+      check_host_dealloc_gpu("elpa_multiply_a_b: tmp1_host", successGPU)

       successGPU = gpu_free(tmp1_dev)
-      check_dealloc_cuda("elpa_multiply_a_b: tmp1_dev", successGPU)
+      check_dealloc_gpu("elpa_multiply_a_b: tmp1_dev", successGPU)
     else ! useGPU
       deallocate(aux_mat, stat=istat, errmsg=errorMessage)
       check_deallocate("elpa_mult_at_b: aux_mat", istat, errorMessage)
src/elpa2/elpa2_bandred_template.F90
@@ -290,15 +290,15 @@ max_threads)
       ! Here we convert the regular host array into a pinned host array
       successGPU = gpu_malloc(a_dev, lda * na_cols * size_of_datatype)
-      check_alloc_cuda("bandred: a_dev", successGPU)
+      check_alloc_gpu("bandred: a_dev", successGPU)

       successGPU = gpu_host_register(int(loc(vav),kind=c_intptr_t), &
                     nbw * nbw * size_of_datatype, &
                     gpuHostRegisterDefault)
-      check_host_register_cuda("bandred: vav", successGPU)
+      check_host_register_gpu("bandred: vav", successGPU)

       successGPU = gpu_malloc(vav_dev, nbw * nbw * size_of_datatype)
-      check_alloc_cuda("bandred: vav_dev", successGPU)
+      check_alloc_gpu("bandred: vav_dev", successGPU)
     endif ! useGPU

     ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
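Unlike the buffers obtained from gpu_malloc_host earlier, a_mat and vav already exist as ordinary Fortran arrays, so the code pins them in place with gpu_host_register and must unregister them before they are deallocated. A sketch of that pairing under the same naming assumptions as the diff, with the surrounding context assumed for illustration:

    ! Illustrative only: pin an existing array for fast DMA, then release the
    ! registration once the GPU transfers are done.
    successGPU = gpu_host_register(int(loc(a_mat), kind=c_intptr_t), &
                                   lda * na_cols * size_of_datatype, &
                                   gpuHostRegisterDefault)
    check_host_register_gpu("sketch: a_mat", successGPU)

    ! ... host-device copies of a_mat benefit from the pinned pages here ...

    successGPU = gpu_host_unregister(int(loc(a_mat), kind=c_intptr_t))
    check_host_unregister_gpu("sketch: a_mat", successGPU)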
@@ -373,17 +373,17 @@ max_threads)
       successGPU = gpu_host_register(int(loc(a_mat),kind=c_intptr_t), &
                     lda * na_cols * size_of_datatype, gpuHostRegisterDefault)
-      check_host_register_cuda("bandred: a_mat", successGPU)
+      check_host_register_gpu("bandred: a_mat", successGPU)

       cur_l_rows = 0
       cur_l_cols = 0

       successGPU = gpu_memcpy(a_dev, int(loc(a_mat),kind=c_intptr_t), &
                     lda * na_cols * size_of_datatype, gpuMemcpyHostToDevice)
-      check_memcpy_cuda("bandred: a_dev", successGPU)
+      check_memcpy_gpu("bandred: a_dev", successGPU)

       successGPU = gpu_malloc(tmat_dev, nbw * nbw * size_of_datatype)
-      check_alloc_cuda("bandred: tmat_dev", successGPU)
+      check_alloc_gpu("bandred: tmat_dev", successGPU)

     istep = (na-1)/nbw
     blk_end = (na-1)/nbw
@@ -417,18 +417,18 @@ max_threads)
     endif