elpa / elpa / Commits / d9b473c0

Commit d9b473c0 authored Mar 24, 2020 by Andreas Marek

    Simplify checking of cuda return codes

parent 88cbfbf3
Changes 4   Pipelines 1
src/elpa1/elpa1_compute_template.F90

@@ -62,7 +62,10 @@
 #define check_memcpy_cuda(file, success) call check_memcpy_CUDA_f(file, __LINE__, success)
 #define check_alloc_cuda(file, success) call check_alloc_CUDA_f(file, __LINE__, success)
 #define check_dealloc_cuda(file, success) call check_dealloc_CUDA_f(file, __LINE__, success)
+#define check_host_register_cuda(file, success) call check_host_register_CUDA_f(file, __LINE__, success)
+#define check_host_unregister_cuda(file, success) call check_host_unregister_CUDA_f(file, __LINE__, success)
+#define check_host_alloc_cuda(file, success) call check_host_alloc_CUDA_f(file, __LINE__, success)
+#define check_host_dealloc_cuda(file, success) call check_host_dealloc_CUDA_f(file, __LINE__, success)
 #endif
 #if REALCASE == 1
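These one-line wrappers pass a short tag plus __LINE__ instead of __FILE__, because a full path expanded from __FILE__ can exceed gfortran's maximum source-line length (the comment added to elpa_multiply_a_b.F90 below states this explicitly). A minimal sketch of a call site and its preprocessed form; the tag string and the expanded line number are hypothetical, not taken from this commit:

  successCUDA = cuda_malloc(a_dev, num)
  check_alloc_cuda("tridiag: a_dev", successCUDA)
  ! after the C preprocessor runs, the second line becomes:
  ! call check_alloc_CUDA_f("tridiag: a_dev", 314, successCUDA)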

src/elpa1/elpa1_tridiag_template.F90
@@ -157,19 +157,16 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
integer
(
kind
=
c_intptr_t
)
::
num
MATH_DATATYPE
(
kind
=
rck
),
allocatable
::
tmp
(:)
MATH_DATATYPE
(
kind
=
rck
),
allocatable
::
v_row
(:)
! used to store calculated Householder Vector
MATH_DATATYPE
(
kind
=
rck
),
allocatable
::
v_row
(:),
&
! used to store calculated Householder Vector
v_col
(:)
! the same Vector, but transposed
MATH_DATATYPE
(
kind
=
rck
),
allocatable
::
u_col
(:),
u_row
(:)
MATH_DATATYPE
(
kind
=
rck
),
allocatable
::
&
! used to store calculated Householder Vector
v_col
(:),
&
! the same Vector, but transposed
! - differently distributed among MPI tasks
u_row
(:),
&
u_col
(:)
! the following two matrices store pairs of vectors v and u calculated in each step
! at most max_stored_uv Vector pairs are stored, than the matrix A_i is explicitli updated
! u and v are stored both in row and Vector forms
! pattern: v1,u1,v2,u2,v3,u3,....
! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
MATH_DATATYPE
(
kind
=
rck
),
allocatable
::
vu_stored_rows
(:,:)
MATH_DATATYPE
(
kind
=
rck
),
pointer
::
vu_stored_rows
(:,:)
! pattern: u1,v1,u2,v2,u3,v3,....
MATH_DATATYPE
(
kind
=
rck
),
allocatable
::
uv_stored_cols
(:,:)
@@ -177,6 +174,9 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
       MATH_DATATYPE(kind=rck), allocatable :: ur_p(:,:), uc_p(:,:)
 #endif
+      type(c_ptr)                          :: v_row_host, v_col_host
+      type(c_ptr)                          :: u_row_host, u_col_host
+      type(c_ptr)                          :: vu_stored_rows_host, uv_stored_cols_host
       real(kind=rk), allocatable           :: tmp_real(:)
       integer(kind=ik)                     :: min_tile_size, error
       integer(kind=ik)                     :: istat
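Each of these c_ptr handles pairs a working array with a page-locked (pinned) host allocation. Elsewhere in this commit (see elpa_multiply_a_b.F90 below) the established pattern is cuda_malloc_host into the c_ptr followed by c_f_pointer to obtain the Fortran array view. A sketch of how v_row_host would plausibly be used; the tridiag allocation site itself is not part of the hunks shown here, so this is an assumption:

  num = (max_local_rows+1) * size_of_datatype
  successCUDA = cuda_malloc_host(v_row_host, num)
  check_host_alloc_cuda("tridiag: v_row_host", successCUDA)
  ! v_row now views the pinned buffer
  call c_f_pointer(v_row_host, v_row, (/max_local_rows+1/))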
@@ -276,13 +276,13 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
       &MATH_DATATYPE ", "tmp", istat, errorMessage)

       ! allocate v_row 1 element longer to allow store and broadcast tau together with it
-      allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
+      allocate(uv_stored_cols(max_local_cols, 2*max_stored_uv), stat=istat, errmsg=errorMessage)
       call check_alloc("tridiag_&
-      &MATH_DATATYPE ", "v_row", istat, errorMessage)
+      &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)

-      allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
+      allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
       call check_alloc("tridiag_&
-      &MATH_DATATYPE ", "u_row", istat, errorMessage)
+      &MATH_DATATYPE ", "v_row", istat, errorMessage)

       allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
       call check_alloc("tridiag_&
@@ -292,6 +292,14 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
       call check_alloc("tridiag_&
       &MATH_DATATYPE ", "u_col", istat, errorMessage)

+      allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
+      call check_alloc("tridiag_&
+      &MATH_DATATYPE ", "u_row", istat, errorMessage)
+
+      allocate(vu_stored_rows(max_local_rows, 2*max_stored_uv), stat=istat, errmsg=errorMessage)
+      call check_alloc("tridiag_&
+      &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
+
 #ifdef WITH_OPENMP
       allocate(ur_p(max_local_rows, 0:max_threads-1), stat=istat, errmsg=errorMessage)
       call check_alloc("tridiag_&
@@ -308,14 +316,6 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
       v_col = 0
       u_col = 0

-      allocate(vu_stored_rows(max_local_rows, 2*max_stored_uv), stat=istat, errmsg=errorMessage)
-      call check_alloc("tridiag_&
-      &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
-
-      allocate(uv_stored_cols(max_local_cols, 2*max_stored_uv), stat=istat, errmsg=errorMessage)
-      call check_alloc("tridiag_&
-      &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
-
       if (useGPU) then
         successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
         check_alloc_cuda("tridiag: v_row_dev", successCUDA)
@@ -407,7 +407,8 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
                          int(l_rows, kind=BLAS_KIND), int(2*n_stored_vecs, kind=BLAS_KIND), &
                          ONE, vu_stored_rows, int(ubound(vu_stored_rows, dim=1), kind=BLAS_KIND), &
 #if REALCASE == 1
-                         uv_stored_cols(l_cols+1, 1), int(ubound(uv_stored_cols, dim=1), kind=BLAS_KIND), &
+                         uv_stored_cols(l_cols+1, 1), &
+                         int(ubound(uv_stored_cols, dim=1), kind=BLAS_KIND), &
 #endif
 #if COMPLEXCASE == 1
                          aux, 1_BLAS_KIND, &
@@ -418,7 +419,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
(whitespace-only change: the "if (my_prow == ..." line is re-indented)
       endif

-      if (my_prow == prow(istep-1, nblk, np_rows)) then
+      if (my_prow == prow(istep-1, nblk, np_rows)) then
         aux1(1) = dot_product(v_row(1:l_rows-1), v_row(1:l_rows-1))
         aux1(2) = v_row(l_rows)
       else
@@ -503,7 +504,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
(whitespace-only change: the "if (useGPU) then" line is re-indented)
       u_col(1:l_cols) = 0
       u_row(1:l_rows) = 0

       if (l_rows > 0 .and. l_cols > 0) then
-        if (useGPU) then
+        if (useGPU) then
           successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
           check_memcpy_cuda("tridiag: u_col_dev", successCUDA)
@@ -553,12 +554,14 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
                 if (i/=j) then
                   if (isSkewsymmetric) then
                     call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1, kind=BLAS_KIND), int(l_col_end-l_col_beg+1, kind=BLAS_KIND), &
-                                        -ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, &
+                                        -ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), &
+                                        v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
                                         ONE, ur_p(l_row_beg, my_thread), 1_BLAS_KIND)
                   else
                     call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1, kind=BLAS_KIND), int(l_col_end-l_col_beg+1, kind=BLAS_KIND), &
-                                        ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, &
+                                        ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), &
+                                        v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
                                         ONE, ur_p(l_row_beg, my_thread), 1_BLAS_KIND)
                   endif
                 endif
@@ -575,19 +578,21 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
             call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                                 int(l_row_end-l_row_beg+1, kind=BLAS_KIND), int(l_col_end-l_col_beg+1, kind=BLAS_KIND), &
                                 ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), &
-                                v_row(l_row_beg), 1_BLAS_KIND, &
-                                ONE, u_col(l_col_beg), 1_BLAS_KIND)
+                                v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, &
+                                ONE, u_col(l_col_beg:max_local_cols), 1_BLAS_KIND)

             if (i/=j) then
               if (isSkewsymmetric) then
                 call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1, kind=BLAS_KIND), int(l_col_end-l_col_beg+1, kind=BLAS_KIND), &
                                     -ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), &
-                                    v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND)
+                                    v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, ONE, u_row(l_row_beg:max_local_rows), &
+                                    1_BLAS_KIND)
               else
                 call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1, kind=BLAS_KIND), int(l_col_end-l_col_beg+1, kind=BLAS_KIND), &
                                     ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), &
-                                    v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND)
+                                    v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, ONE, u_row(l_row_beg:max_local_rows), &
+                                    1_BLAS_KIND)
               endif
             endif
             if (wantDebug) call obj%timer%stop("blas")
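In the two GEMV hunks above, vector arguments change from a bare starting element, e.g. v_col(l_col_beg), to an explicit section, e.g. v_col(l_col_beg:max_local_cols). One hedged reading: an element actual argument relies on sequence association, which assumes contiguous storage from that element onward, while an explicit section stays well-defined even when the array is pointer-associated with externally allocated memory (the compiler inserts copy-in/copy-out for a non-contiguous section). A self-contained illustration with hypothetical names, not ELPA code:

  subroutine scale_tail(v, n)
    implicit none
    double precision, pointer, intent(in) :: v(:)
    integer, intent(in)                   :: n
    external dscal
    ! v(5) as the actual argument would assume contiguity from element 5 on;
    ! the section v(5:n) is valid for any pointer association
    call dscal(n-4, 2.0d0, v(5:n), 1)
  end subroutine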
@@ -805,7 +810,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
           successCUDA = cuda_memcpy(vu_stored_rows_dev, int(loc(vu_stored_rows(1,1)), kind=c_intptr_t), &
                                     max_local_rows * 2 * max_stored_uv * &
                                     size_of_datatype, cudaMemcpyHostToDevice)
-          check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)
+          check_memcpy_cuda("tridiag: uv_stored_rows_dev", successCUDA)

           successCUDA = cuda_memcpy(uv_stored_cols_dev, int(loc(uv_stored_cols(1,1)), kind=c_intptr_t), &
                                     max_local_cols * 2 * max_stored_uv * &
@@ -843,8 +848,10 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
             call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
                                 int(l_row_end-l_row_beg+1, kind=BLAS_KIND), int(l_col_end-l_col_beg+1, kind=BLAS_KIND), &
                                 int(2*n_stored_vecs, kind=BLAS_KIND), &
-                                ONE, vu_stored_rows(l_row_beg, 1), int(ubound(vu_stored_rows, dim=1), kind=BLAS_KIND), &
-                                uv_stored_cols(l_col_beg, 1), int(ubound(uv_stored_cols, dim=1), kind=BLAS_KIND), &
+                                ONE, vu_stored_rows(l_row_beg:max_local_rows, 1:2*max_stored_uv), &
+                                int(ubound(vu_stored_rows, dim=1), kind=BLAS_KIND), &
+                                uv_stored_cols(l_col_beg, 1), &
+                                int(ubound(uv_stored_cols, dim=1), kind=BLAS_KIND), &
                                 ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND))
             if (wantDebug) call obj%timer%stop("blas")
           endif !useGPU
@@ -988,7 +995,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
       endif
 #endif

-      deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
+      deallocate(tmp, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *, "tridiag: error when deallocating "//errorMessage
         stop 1
@@ -1049,7 +1056,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
         print *, "tridiag: error when deallocating tmp_real "//errorMessage
         stop 1
       endif

+      deallocate(v_row, v_col, u_row, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
+      if (istat .ne. 0) then
+        print *, "tridiag: error when deallocating "//errorMessage
+        stop 1
+      endif

       call obj%timer%stop("tridiag_&
       &MATH_DATATYPE&

src/elpa1/elpa_multiply_a_b.F90
@@ -52,6 +52,15 @@
 ! Author: A. Marek, MPCDF

+!cannot use __FILE__ because filename with path can be too long for gfortran (max line length)
+#define check_memcpy_cuda(file, success) call check_memcpy_CUDA_f(file, __LINE__, success)
+#define check_alloc_cuda(file, success) call check_alloc_CUDA_f(file, __LINE__, success)
+#define check_dealloc_cuda(file, success) call check_dealloc_CUDA_f(file, __LINE__, success)
+#define check_host_register_cuda(file, success) call check_host_register_CUDA_f(file, __LINE__, success)
+#define check_host_unregister_cuda(file, success) call check_host_unregister_CUDA_f(file, __LINE__, success)
+#define check_host_alloc_cuda(file, success) call check_host_alloc_CUDA_f(file, __LINE__, success)
+#define check_host_dealloc_cuda(file, success) call check_host_dealloc_CUDA_f(file, __LINE__, success)
+
 #include "../general/sanity.F90"
 use elpa1_compute
@@ -193,67 +202,34 @@
       ! copy b to b_dev
       num = ldb * ldbCols * size_of_datatype
       successCUDA = cuda_malloc(b_dev, num)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaMalloc b_dev"
-        stop
-      endif
+      check_alloc_cuda("elpa_mult_at_b: b_dev", successCUDA)

       successCUDA = cuda_host_register(int(loc(b), kind=c_intptr_t), num, &
                     cudaHostRegisterDefault)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaHostRegister b"
-        stop
-      endif
+      check_host_register_cuda("elpa_mult_at_b: b", successCUDA)

       successCUDA = cuda_memcpy(b_dev, int(loc(b), kind=c_intptr_t), num, &
                     cudaMemcpyHostToDevice)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaMemcpy, b H2D"
-      endif
+      check_memcpy_cuda("elpa_mult_at_b: b to b_dev", successCUDA)

       num = l_rows * nblk_mult * size_of_datatype
       successCUDA = cuda_malloc_host(aux_host, num)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaMallocHost aux"
-        stop
-      endif
+      check_host_alloc_cuda("elpa_mult_at_b: aux_host", successCUDA)
       call c_f_pointer(aux_host, aux_mat, (/l_rows, nblk_mult/))

       successCUDA = cuda_malloc(aux_dev, num)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaMalloc aux_dev"
-        stop
-      endif
+      check_alloc_cuda("elpa_mult_at_b: aux_dev", successCUDA)

       num = nblk_mult * l_cols * size_of_datatype
       successCUDA = cuda_malloc_host(tmp1_host, num)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaMallocHost tmp1_host"
-        stop
-      endif
+      check_host_alloc_cuda("elpa_mult_at_b: tmp1_host", successCUDA)
       call c_f_pointer(tmp1_host, tmp1, (/nblk_mult, l_cols/))

       successCUDA = cuda_malloc(tmp1_dev, num)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaMalloc tmp1_dev"
-        stop
-      endif
+      check_alloc_cuda("elpa_mult_at_b: tmp1_dev", successCUDA)
     else ! useGPU
       allocate(aux_mat(l_rows, nblk_mult), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
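Since b is caller-owned, it is pinned in place with cuda_host_register rather than allocated through cuda_malloc_host. Condensed from this hunk and the cleanup hunk further down, the lifecycle reads:

  successCUDA = cuda_host_register(int(loc(b), kind=c_intptr_t), num, cudaHostRegisterDefault)
  check_host_register_cuda("elpa_mult_at_b: b", successCUDA)
  ! ... device transfers that use b as a pinned source ...
  successCUDA = cuda_host_unregister(int(loc(b), kind=c_intptr_t))
  check_host_unregister_cuda("elpa_multiply_a_b: b", successCUDA)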
@@ -265,28 +241,16 @@
       endif ! useGPU

       allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage)
-      if (istat .ne. 0) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error when allocating aux_bc "//errorMessage
-        stop
-      endif
+      call check_alloc("elpa_mult_at_b_&
+      &MATH_DATATYPE ", "aux_bc", istat, errorMessage)

       allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage)
-      if (istat .ne. 0) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error when allocating lrs_save "//errorMessage
-        stop
-      endif
+      call check_alloc("elpa_mult_at_b_&
+      &MATH_DATATYPE ", "lrs_save", istat, errorMessage)

       allocate(lre_save(nblk), stat=istat, errmsg=errorMessage)
-      if (istat .ne. 0) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error when allocating lre_save "//errorMessage
-        stop
-      endif
+      call check_alloc("elpa_mult_at_b_&
+      &MATH_DATATYPE ", "lre_save", istat, errorMessage)

       a_lower = .false.
       a_upper = .false.
@@ -393,24 +357,15 @@
           if (lcs <= lce) then
             allocate(tmp1(nstor, lcs:lce), tmp2(nstor, lcs:lce), stat=istat, errmsg=errorMessage)
-            if (istat .ne. 0) then
-              print *, "elpa_mult_at_b_&
-              &MATH_DATATYPE&
-              &: error when allocating tmp1 "//errorMessage
-              stop
-            endif
+            call check_alloc("elpa_mult_at_b_&
+            &MATH_DATATYPE ", "tmp1", istat, errorMessage)

             if (lrs <= lre) then
               if (useGPU) then
                 num = l_rows * nblk_mult * size_of_datatype
                 successCUDA = cuda_memcpy(aux_dev, int(loc(aux_mat), kind=c_intptr_t), &
                               num, cudaMemcpyHostToDevice)
-                if (.not. successCUDA) then
-                  print *, "elpa_mult_at_b_&
-                  &MATH_DATATYPE&
-                  &: error in cudaMemcpy aux_mat H2D"
-                  stop
-                endif
+                check_memcpy_cuda("elpa_mult_at_b: aux_mat to aux_dev", successCUDA)

                 aux_off = (lrs-1) * size_of_datatype
                 b_off = ((lcs-1)*ldb + lrs-1) * size_of_datatype
@@ -424,12 +379,7 @@
                 num = nstor * (lce-lcs+1) * size_of_datatype
                 successCUDA = cuda_memcpy(int(loc(tmp1), kind=c_intptr_t), &
                               tmp1_dev, num, cudaMemcpyDeviceToHost)
-                if (.not. successCUDA) then
-                  print *, "elpa_mult_at_b_&
-                  &MATH_DATATYPE&
-                  &: error in cudaMemcpy tmp1 D2H"
-                  stop
-                endif
+                check_memcpy_cuda("elpa_mult_at_b: tmp1_dev to tmp1", successCUDA)
               else ! useGPU
                 call obj%timer%start("blas")
                 call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', int(nstor, kind=BLAS_KIND), &
@@ -478,55 +428,25 @@
       if (useGPU) then
         successCUDA = cuda_free(b_dev)
-        if (.not. successCUDA) then
-          print *, "elpa_mult_at_b_&
-          &MATH_DATATYPE&
-          &: error in cudaFree b_dev"
-          stop
-        endif
+        check_dealloc_cuda("elpa_multiply_a_b: b_dev", successCUDA)

         successCUDA = cuda_host_unregister(int(loc(b), kind=c_intptr_t))
-        if (.not. successCUDA) then
-          print *, "elpa_mult_at_b_&
-          &MATH_DATATYPE&
-          &: error in cudaHostUnregister b"
-          stop
-        endif
+        check_host_unregister_cuda("elpa_multiply_a_b: b", successCUDA)

         nullify(aux_mat)
         nullify(tmp1)

         successCUDA = cuda_free_host(aux_host)
-        if (.not. successCUDA) then
-          print *, "elpa_mult_at_b_&
-          &MATH_DATATYPE&
-          &: error in cudaFreeHost aux_host"
-          stop
-        endif
+        check_host_dealloc_cuda("elpa_multiply_a_b: aux_host", successCUDA)

         successCUDA = cuda_free(aux_dev)
-        if (.not. successCUDA) then
-          print *, "elpa_mult_at_b_&
-          &MATH_DATATYPE&
-          &: error in cudaFree aux_dev"
-          stop
-        endif
+        check_dealloc_cuda("elpa_multiply_a_b: aux_dev", successCUDA)

         successCUDA = cuda_free_host(tmp1_host)
-        if (.not. successCUDA) then
-          print *, "elpa_mult_at_b_&
-          &MATH_DATATYPE&
-          &: error in cudaFreeHost tmp1_host"
-          stop 1
-        endif
+        check_host_dealloc_cuda("elpa_multiply_a_b: tmp1_host", successCUDA)

         successCUDA = cuda_free(tmp1_dev)
-        if (.not. successCUDA) then
-          print *, "elpa_mult_at_b_&
-          &MATH_DATATYPE&
-          &: error in cudaFree tmp1_dev"
-          stop
-        endif
+        check_dealloc_cuda("elpa_multiply_a_b: tmp1_dev", successCUDA)
       else ! useGPU
         deallocate(aux_mat, stat=istat, errmsg=errorMessage)
         if (istat .ne. 0) then
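Note the cleanup order in the GPU branch: the Fortran pointer views are nullified before the pinned allocations behind them are released, so no dangling array views outlive the buffers. Condensed from the hunk:

  nullify(aux_mat)                        ! drop the Fortran views first
  nullify(tmp1)
  successCUDA = cuda_free_host(aux_host)  ! then release the pinned buffers
  check_host_dealloc_cuda("elpa_multiply_a_b: aux_host", successCUDA)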

src/general/elpa_utilities.F90
@@ -61,6 +61,7 @@ module ELPA_utilities
   public :: output_unit, error_unit
   public :: check_alloc, check_alloc_CUDA_f, check_memcpy_CUDA_f, check_dealloc_CUDA_f
+  public :: check_host_alloc_CUDA_f, check_host_dealloc_CUDA_f, check_host_register_CUDA_f, check_host_unregister_CUDA_f
   public :: map_global_array_index_to_local_index
   public :: pcol, prow
   public :: local_index ! Get local index of a block cyclic distributed matrix
@@ -206,4 +207,59 @@ module ELPA_utilities
     endif
   end subroutine

+  subroutine check_host_alloc_CUDA_f(file_name, line, successCUDA)
+    implicit none
+    character(len=*), intent(in)    :: file_name
+    integer(kind=c_int), intent(in) :: line
+    logical                         :: successCUDA
+
+    if (.not.(successCUDA)) then
+      print *, file_name, ":", line, " error in cuda_alloc_host when allocating "
+      stop 1
+    endif
+  end subroutine
+
+  subroutine check_host_dealloc_CUDA_f(file_name, line, successCUDA)
+    implicit none
+    character(len=*), intent(in)    :: file_name
+    integer(kind=c_int), intent(in) :: line
+    logical                         :: successCUDA
+
+    if (.not.(successCUDA)) then
+      print *, file_name, ":", line, " error in cuda_free_host when deallocating "
+      stop 1
+    endif
+  end subroutine
+
+  subroutine check_host_register_CUDA_f(file_name, line, successCUDA)
+    implicit none
+    character(len=*), intent(in)    :: file_name
+    integer(kind=c_int), intent(in) :: line
+    logical                         :: successCUDA
+
+    if (.not.(successCUDA)) then
+      print *, file_name, ":", line, " error in cuda_host_register when registering "
+      stop 1
+    endif
+  end subroutine
+
+  subroutine check_host_unregister_CUDA_f(file_name, line, successCUDA)
+    implicit none
+    character(len=*), intent(in)    :: file_name
+    integer(kind=c_int), intent(in) :: line
+    logical                         :: successCUDA
+
+    if (.not.(successCUDA)) then
+      print *, file_name, ":", line, " error in cuda_host_unregister when unregistering "
+      stop 1
+    endif
+  end subroutine
+
 end module ELPA_utilities
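All four new checkers share one shape: print the caller-supplied tag, the source line, and a fixed message, then terminate with stop 1. A hypothetical direct call (in practice the CPP macros supply __LINE__; the tag and line number here are made up):

  call check_host_alloc_CUDA_f("elpa_mult_at_b: aux_host", 210_c_int, successCUDA)
  ! on failure this prints something like
  !   elpa_mult_at_b: aux_host : 210  error in cuda_alloc_host when allocating
  ! and stops the program with exit code 1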