elpa / elpa / Commits / d9b473c0

Commit d9b473c0, authored Mar 24, 2020 by Andreas Marek

Simplify checking of cuda return codes

parent 88cbfbf3 | Changes 4 | Pipelines 1
src/elpa1/elpa1_compute_template.F90
@@ -62,7 +62,10 @@
 #define check_memcpy_cuda(file, success) call check_memcpy_CUDA_f(file, __LINE__, success)
 #define check_alloc_cuda(file, success) call check_alloc_CUDA_f(file, __LINE__, success)
 #define check_dealloc_cuda(file, success) call check_dealloc_CUDA_f(file, __LINE__, success)
+#define check_host_register_cuda(file, success) call check_host_register_CUDA_f(file, __LINE__, success)
+#define check_host_unregister_cuda(file, success) call check_host_unregister_CUDA_f(file, __LINE__, success)
+#define check_host_alloc_cuda(file, success) call check_host_alloc_CUDA_f(file, __LINE__, success)
+#define check_host_dealloc_cuda(file, success) call check_host_dealloc_CUDA_f(file, __LINE__, success)
 #endif
 #if REALCASE == 1
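These macros are the heart of the change: each one wraps a call to a checker subroutine and splices in __LINE__, so a failing CUDA call is reported with its exact source line. A short literal tag is passed instead of __FILE__ because, as the comment added in elpa_multiply_a_b.F90 below notes, a filename with its full path can exceed gfortran's maximum line length. A minimal sketch of what such a checker can look like (the actual ELPA helpers may differ in detail):

subroutine check_alloc_CUDA_f(file_name, line, successCUDA)
  implicit none
  character(len=*), intent(in) :: file_name    ! short tag chosen at the call site, not __FILE__
  integer, intent(in)          :: line         ! spliced in by the macro as __LINE__
  logical, intent(in)          :: successCUDA  ! status returned by the cuda_* wrapper
  if (.not. successCUDA) then
    print *, trim(file_name), " line ", line, ": CUDA allocation failed"
    stop 1
  endif
end subroutine check_alloc_CUDA_f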
src/elpa1/elpa1_tridiag_template.F90
@@ -157,19 +157,16 @@
       integer(kind=c_intptr_t)                 :: num
       MATH_DATATYPE(kind=rck), allocatable     :: tmp(:)
-      MATH_DATATYPE(kind=rck), allocatable     :: v_row(:)  ! used to store calculated Householder Vector
-      MATH_DATATYPE(kind=rck), allocatable     ::           &  ! used to store calculated Householder Vector
-                                                  v_col(:), &  ! the same Vector, but transposed
-                                                               ! - differently distributed among MPI tasks
-                                                  u_row(:), &
-                                                  u_col(:)
+      MATH_DATATYPE(kind=rck), allocatable     :: v_row(:), & ! used to store calculated Householder Vector
+                                                  v_col(:)    ! the same Vector, but transposed
+      MATH_DATATYPE(kind=rck), allocatable     :: u_col(:), u_row(:)
 
       ! the following two matrices store pairs of vectors v and u calculated in each step
       ! at most max_stored_uv Vector pairs are stored, than the matrix A_i is explicitli updated
       ! u and v are stored both in row and Vector forms
       ! pattern: v1,u1,v2,u2,v3,u3,....
       ! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
-      MATH_DATATYPE(kind=rck), allocatable     :: vu_stored_rows(:,:)
+      MATH_DATATYPE(kind=rck), pointer         :: vu_stored_rows(:,:)
       ! pattern: u1,v1,u2,v2,u3,v3,....
       MATH_DATATYPE(kind=rck), allocatable     :: uv_stored_cols(:,:)
@@ -177,6 +174,9 @@
       MATH_DATATYPE(kind=rck), allocatable     :: ur_p(:,:), uc_p(:,:)
 #endif
+      type(c_ptr)                              :: v_row_host, v_col_host
+      type(c_ptr)                              :: u_row_host, u_col_host
+      type(c_ptr)                              :: vu_stored_rows_host, uv_stored_cols_host
       real(kind=rk), allocatable               :: tmp_real(:)
       integer(kind=ik)                         :: min_tile_size, error
       integer(kind=ik)                         :: istat
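The new type(c_ptr) handles, together with the allocatable-to-pointer switch for vu_stored_rows above, prepare the pinned-host-memory pattern this commit already uses in elpa_multiply_a_b.F90 below: cuda_malloc_host returns a C pointer, and c_f_pointer maps it onto a Fortran array, which is only possible for a pointer array, not an allocatable one. A hedged sketch of that pattern using the names declared here (whether this commit already wires vu_stored_rows through this path is not visible in the excerpt):

! pinned allocation through the CUDA runtime, then a Fortran array view onto it
num = max_local_rows * 2 * max_stored_uv * size_of_datatype
successCUDA = cuda_malloc_host(vu_stored_rows_host, num)
check_host_alloc_cuda("tridiag: vu_stored_rows_host", successCUDA)
call c_f_pointer(vu_stored_rows_host, vu_stored_rows, (/max_local_rows, 2*max_stored_uv/))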
@@ -276,22 +276,30 @@
                  &MATH_DATATYPE ", "tmp", istat, errorMessage)
 
       ! allocate v_row 1 element longer to allow store and broadcast tau together with it
-      allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
+      allocate(uv_stored_cols(max_local_cols, 2*max_stored_uv), stat=istat, errmsg=errorMessage)
       call check_alloc("tridiag_&
-      &MATH_DATATYPE ", "v_row", istat, errorMessage)
+      &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
 
-      allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
+      allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
       call check_alloc("tridiag_&
-      &MATH_DATATYPE ", "u_row", istat, errorMessage)
+      &MATH_DATATYPE ", "v_row", istat, errorMessage)
 
       allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
       call check_alloc("tridiag_&
       &MATH_DATATYPE ", "v_col", istat, errorMessage)
 
       allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
       call check_alloc("tridiag_&
       &MATH_DATATYPE ", "u_col", istat, errorMessage)
 
+      allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
+      call check_alloc("tridiag_&
+      &MATH_DATATYPE ", "u_row", istat, errorMessage)
+
+      allocate(vu_stored_rows(max_local_rows, 2*max_stored_uv), stat=istat, errmsg=errorMessage)
+      call check_alloc("tridiag_&
+      &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
+
 #ifdef WITH_OPENMP
       allocate(ur_p(max_local_rows, 0:max_threads-1), stat=istat, errmsg=errorMessage)
       call check_alloc("tridiag_&
@@ -308,14 +316,6 @@
       v_col = 0
       u_col = 0
 
-      allocate(vu_stored_rows(max_local_rows, 2*max_stored_uv), stat=istat, errmsg=errorMessage)
-      call check_alloc("tridiag_&
-      &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
-
-      allocate(uv_stored_cols(max_local_cols, 2*max_stored_uv), stat=istat, errmsg=errorMessage)
-      call check_alloc("tridiag_&
-      &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
-
       if (useGPU) then
         successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
         check_alloc_cuda("tridiag: v_row_dev", successCUDA)
@@ -398,39 +398,40 @@
           v_row(1:l_rows) = a_mat(1:l_rows, l_cols+1)
         endif
 
         if (n_stored_vecs > 0 .and. l_rows > 0) then
           if (wantDebug) call obj%timer%start("blas")
 #if COMPLEXCASE == 1
           aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1, 1:2*n_stored_vecs))
 #endif
           call PRECISION_GEMV('N', &
                               int(l_rows, kind=BLAS_KIND), int(2*n_stored_vecs, kind=BLAS_KIND), &
                               ONE, vu_stored_rows, int(ubound(vu_stored_rows, dim=1), kind=BLAS_KIND), &
 #if REALCASE == 1
-                              uv_stored_cols(l_cols+1, 1), int(ubound(uv_stored_cols, dim=1), kind=BLAS_KIND), &
+                              uv_stored_cols(l_cols+1, 1), &
+                              int(ubound(uv_stored_cols, dim=1), kind=BLAS_KIND), &
 #endif
 #if COMPLEXCASE == 1
                               aux, 1_BLAS_KIND, &
 #endif
                               ONE, v_row, 1_BLAS_KIND)
           if (wantDebug) call obj%timer%stop("blas")
         endif
 
         if (my_prow == prow(istep-1, nblk, np_rows)) then
           aux1(1) = dot_product(v_row(1:l_rows-1), v_row(1:l_rows-1))
           aux1(2) = v_row(l_rows)
         else
           aux1(1) = dot_product(v_row(1:l_rows), v_row(1:l_rows))
           aux1(2) = 0.
         endif
 
 #ifdef WITH_MPI
         if (wantDebug) call obj%timer%start("mpi_communication")
         call mpi_allreduce(aux1, aux2, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, &
                            MPI_SUM, int(mpi_comm_rows, kind=MPI_KIND), mpierr)
         if (wantDebug) call obj%timer%stop("mpi_communication")
 #else /* WITH_MPI */
         aux2 = aux1
 #endif /* WITH_MPI */
@@ -484,26 +485,26 @@
       if (wantDebug) call obj%timer%stop("mpi_communication")
 #endif /* WITH_MPI */
 
       !recover tau, which has been broadcasted together with v_row
       tau(istep) = v_row(l_rows+1)
 
       ! Transpose Householder Vector v_row -> v_col
       call elpa_transpose_vectors_&
            &MATH_DATATYPE&
            &_&
            &PRECISION&
            (obj, v_row, ubound(v_row, dim=1), mpi_comm_rows, v_col, ubound(v_col, dim=1), mpi_comm_cols, &
             1, istep-1, 1, nblk, max_threads)
 
       ! Calculate u = (A + VU**T + UV**T)*v
       ! For cache efficiency, we use only the upper half of the matrix tiles for this,
       ! thus the result is partly in u_col(:) and partly in u_row(:)
       u_col(1:l_cols) = 0
       u_row(1:l_rows) = 0
       if (l_rows > 0 .and. l_cols > 0) then
         if (useGPU) then
           successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
           check_memcpy_cuda("tridiag: u_col_dev", successCUDA)
@@ -553,12 +554,14 @@
                   if (i/=j) then
                     if (isSkewsymmetric) then
                       call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1, kind=BLAS_KIND), int(l_col_end-l_col_beg+1, kind=BLAS_KIND), &
-                                          -ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, &
+                                          -ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), &
+                                          v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
                                           ONE, ur_p(l_row_beg, my_thread), 1_BLAS_KIND)
                     else
                       call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1, kind=BLAS_KIND), int(l_col_end-l_col_beg+1, kind=BLAS_KIND), &
-                                          ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, &
+                                          ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), &
+                                          v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
                                           ONE, ur_p(l_row_beg, my_thread), 1_BLAS_KIND)
                     endif
                   endif
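The GEMV changes in this and the following hunk swap a bare starting element such as v_col(l_col_beg) for an explicit array section such as v_col(l_col_beg:max_local_cols). Both forms are legal for an explicit-shape BLAS dummy argument (sequence association), but the section states the intended extent explicitly, which run-time bounds checking can verify. A small self-contained illustration, not ELPA code (link against any BLAS):

program gemv_section_demo
  implicit none
  integer, parameter :: n = 4, i0 = 2
  real(8) :: a(n,n), x(n), y(n)
  external :: dgemv
  a = 1.0d0; x = 1.0d0; y = 0.0d0
  ! start-element form, as in the old code (sequence association):
  call dgemv('N', n-i0+1, n-i0+1, 1.0d0, a(i0,i0), n, x(i0), 1, 0.0d0, y(i0), 1)
  ! array-section form, as in the new code; same result, explicit extent:
  call dgemv('N', n-i0+1, n-i0+1, 1.0d0, a(i0,i0), n, x(i0:n), 1, 0.0d0, y(i0:n), 1)
  print *, y(i0:n)   ! each entry is the row sum over the 3x3 trailing block: 3.0
end program gemv_section_demo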
@@ -575,19 +578,21 @@
               call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                                   int(l_row_end-l_row_beg+1, kind=BLAS_KIND), int(l_col_end-l_col_beg+1, kind=BLAS_KIND), &
                                   ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), &
-                                  v_row(l_row_beg), 1_BLAS_KIND, &
-                                  ONE, u_col(l_col_beg), 1_BLAS_KIND)
+                                  v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, &
+                                  ONE, u_col(l_col_beg:max_local_cols), 1_BLAS_KIND)
 
               if (i/=j) then
                 if (isSkewsymmetric) then
                   call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1, kind=BLAS_KIND), int(l_col_end-l_col_beg+1, kind=BLAS_KIND), &
                                       -ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), &
-                                      v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND)
+                                      v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, ONE, u_row(l_row_beg:max_local_rows), &
+                                      1_BLAS_KIND)
                 else
                   call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1, kind=BLAS_KIND), int(l_col_end-l_col_beg+1, kind=BLAS_KIND), &
                                       ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND), &
-                                      v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND)
+                                      v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, ONE, u_row(l_row_beg:max_local_rows), &
+                                      1_BLAS_KIND)
                 endif
               endif
               if (wantDebug) call obj%timer%stop("blas")
@@ -805,7 +810,7 @@
             successCUDA = cuda_memcpy(vu_stored_rows_dev, int(loc(vu_stored_rows(1,1)), kind=c_intptr_t), &
                                       max_local_rows * 2 * max_stored_uv * &
                                       size_of_datatype, cudaMemcpyHostToDevice)
-            check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)
+            check_memcpy_cuda("tridiag: uv_stored_rows_dev", successCUDA)
 
             successCUDA = cuda_memcpy(uv_stored_cols_dev, int(loc(uv_stored_cols(1,1)), kind=c_intptr_t), &
                                       max_local_cols * 2 * max_stored_uv * &
@@ -843,8 +848,10 @@
                 call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
                                     int(l_row_end-l_row_beg+1, kind=BLAS_KIND), int(l_col_end-l_col_beg+1, kind=BLAS_KIND), &
                                     int(2*n_stored_vecs, kind=BLAS_KIND), &
-                                    ONE, vu_stored_rows(l_row_beg, 1), int(ubound(vu_stored_rows, dim=1), kind=BLAS_KIND), &
-                                    uv_stored_cols(l_col_beg, 1), int(ubound(uv_stored_cols, dim=1), kind=BLAS_KIND), &
+                                    ONE, vu_stored_rows(l_row_beg:max_local_rows, 1:2*max_stored_uv), &
+                                    int(ubound(vu_stored_rows, dim=1), kind=BLAS_KIND), &
+                                    uv_stored_cols(l_col_beg, 1), &
+                                    int(ubound(uv_stored_cols, dim=1), kind=BLAS_KIND), &
                                     ONE, a_mat(l_row_beg, l_col_beg), int(lda, kind=BLAS_KIND))
                 if (wantDebug) call obj%timer%stop("blas")
               endif !useGPU
@@ -988,7 +995,7 @@
       endif
 #endif
 
-      deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
+      deallocate(tmp, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *, "tridiag: error when deallocating " // errorMessage
         stop 1
@@ -1049,7 +1056,11 @@
         print *, "tridiag: error when deallocating tmp_real " // errorMessage
         stop 1
       endif
 
+      deallocate(v_row, v_col, u_row, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
+      if (istat .ne. 0) then
+        print *, "tridiag: error when deallocating " // errorMessage
+        stop 1
+      endif
+
       call obj%timer%stop("tridiag_&
       &MATH_DATATYPE&
src/elpa1/elpa_multiply_a_b.F90
@@ -52,6 +52,15 @@
 ! Author: A. Marek, MPCDF
 
+!cannot use __FILE__ because filename with path can be too long for gfortran (max line length)
+#define check_memcpy_cuda(file, success) call check_memcpy_CUDA_f(file, __LINE__, success)
+#define check_alloc_cuda(file, success) call check_alloc_CUDA_f(file, __LINE__, success)
+#define check_dealloc_cuda(file, success) call check_dealloc_CUDA_f(file, __LINE__, success)
+#define check_host_register_cuda(file, success) call check_host_register_CUDA_f(file, __LINE__, success)
+#define check_host_unregister_cuda(file, success) call check_host_unregister_CUDA_f(file, __LINE__, success)
+#define check_host_alloc_cuda(file, success) call check_host_alloc_CUDA_f(file, __LINE__, success)
+#define check_host_dealloc_cuda(file, success) call check_host_dealloc_CUDA_f(file, __LINE__, success)
 #include "../general/sanity.F90"
 use elpa1_compute
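After preprocessing, each one-line check carries its own location. An illustrative expansion (the line number 207 is made up, standing in for whatever __LINE__ evaluates to at the call site):

! written in the source:
check_alloc_cuda("elpa_mult_at_b: b_dev", successCUDA)
! seen by the compiler:
call check_alloc_CUDA_f("elpa_mult_at_b: b_dev", 207, successCUDA)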
@@ -193,67 +202,34 @@
       ! copy b to b_dev
       num = ldb * ldbCols * size_of_datatype
       successCUDA = cuda_malloc(b_dev, num)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaMalloc b_dev"
-        stop
-      endif
+      check_alloc_cuda("elpa_mult_at_b: b_dev", successCUDA)
 
       successCUDA = cuda_host_register(int(loc(b), kind=c_intptr_t), num, &
                                        cudaHostRegisterDefault)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaHostRegister b"
-        stop
-      endif
+      check_host_register_cuda("elpa_mult_at_b: b", successCUDA)
 
       successCUDA = cuda_memcpy(b_dev, int(loc(b), kind=c_intptr_t), num, &
                                 cudaMemcpyHostToDevice)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaMemcpy, b H2D"
-      endif
+      check_memcpy_cuda("elpa_mult_at_b: b to b_dev", successCUDA)
 
       num = l_rows * nblk_mult * size_of_datatype
       successCUDA = cuda_malloc_host(aux_host, num)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaMallocHost aux"
-        stop
-      endif
+      check_host_alloc_cuda("elpa_mult_at_b: aux_host", successCUDA)
       call c_f_pointer(aux_host, aux_mat, (/l_rows, nblk_mult/))
 
       successCUDA = cuda_malloc(aux_dev, num)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaMalloc aux_dev"
-        stop
-      endif
+      check_alloc_cuda("elpa_mult_at_b: aux_dev", successCUDA)
 
       num = nblk_mult * l_cols * size_of_datatype
       successCUDA = cuda_malloc_host(tmp1_host, num)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaMallocHost tmp1_host"
-        stop
-      endif
+      check_host_alloc_cuda("elpa_mult_at_b: tmp1_host", successCUDA)
       call c_f_pointer(tmp1_host, tmp1, (/nblk_mult, l_cols/))
 
       successCUDA = cuda_malloc(tmp1_dev, num)
-      if (.not. successCUDA) then
-        print *, "elpa_mult_at_b_&
-        &MATH_DATATYPE&
-        &: error in cudaMalloc tmp1_dev"
-        stop
-      endif
+      check_alloc_cuda("elpa_mult_at_b: tmp1_dev", successCUDA)
 
     else ! useGPU
       allocate(aux_mat(l_rows, nblk_mult), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
@@ -265,28 +241,16 @@
     endif ! useGPU
 
     allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *, "elpa_mult_at_b_&
-      &MATH_DATATYPE&
-      &: error when allocating aux_bc " // errorMessage
-      stop
-    endif
+    call check_alloc("elpa_mult_at_b_&
+    &MATH_DATATYPE ", "aux_bc", istat, errorMessage)
 
    allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *, "elpa_mult_at_b_&
-      &MATH_DATATYPE&
-      &: error when allocating lrs_save " // errorMessage
-      stop
-    endif
+    call check_alloc("elpa_mult_at_b_&
+    &MATH_DATATYPE ", "lrs_save", istat, errorMessage)
 
    allocate(lre_save(nblk), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *, "elpa_mult_at_b_&
-      &MATH_DATATYPE&
-      &: error when allocating lre_save " // errorMessage
-      stop
-    endif
+    call check_alloc("elpa_mult_at_b_&
+    &MATH_DATATYPE ", "lre_save", istat, errorMessage)
 
    a_lower = .false.
    a_upper = .false.
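The call check_alloc(...) form introduced above replaces the hand-rolled if (istat .ne. 0) blocks for plain Fortran allocate statements, mirroring what the check_*_cuda macros do for CUDA calls. A minimal sketch of such a helper (the real check_alloc lives elsewhere in ELPA and may differ):

subroutine check_alloc(caller_name, array_name, istat, errorMessage)
  implicit none
  character(len=*), intent(in) :: caller_name   ! e.g. "tridiag_real" (illustrative)
  character(len=*), intent(in) :: array_name    ! e.g. "aux_bc"
  integer, intent(in)          :: istat         ! stat= result of allocate
  character(len=*), intent(in) :: errorMessage  ! errmsg= result of allocate
  if (istat .ne. 0) then
    print *, trim(caller_name), ": error when allocating ", trim(array_name), " ", trim(errorMessage)
    stop 1
  endif
end subroutine check_alloc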
@@ -393,24 +357,15 @@
           if (lcs <= lce) then
             allocate(tmp1(nstor, lcs:lce), tmp2(nstor, lcs:lce), stat=istat, errmsg=errorMessage)
-            if (istat .ne. 0) then
-              print *, "elpa_mult_at_b_&
-              &MATH_DATATYPE&
-              &: error when allocating tmp1 " // errorMessage
-              stop
-            endif
+            call check_alloc("elpa_mult_at_b_&
+            &MATH_DATATYPE ", "tmp1", istat, errorMessage)
 
             if (lrs <= lre) then
               if (useGPU) then
                 num = l_rows * nblk_mult * size_of_datatype
                 successCUDA = cuda_memcpy(aux_dev, int(loc(aux_mat), kind=c_intptr_t), &
                                           num, cudaMemcpyHostToDevice)
-                if (.not. successCUDA) then
-                  print *, "elpa_mult_at_b_&
-                  &MATH_DATATYPE&
-                  &: error in cudaMemcpy aux_mat H2D"
-                  stop
-                endif
+                check_memcpy_cuda("elpa_mult_at_b: aux_mat to aux_dev", successCUDA)
 
                 aux_off = (lrs-1) * size_of_datatype
                 b_off = ((lcs-1)*ldb + lrs-1) * size_of_datatype
@@ -424,12 +379,7 @@
             num = nstor*(lce-lcs+1)*size_of_datatype