Commit 80951583
authored May 08, 2018 by Andreas Marek

Merge branch 'master_pre_stage' of https://gitlab.mpcdf.mpg.de/elpa/elpa into master_pre_stage

parents a9f1fc4b 2f62f542
Changes 11    Pipelines 1
.gitlab-ci.yml
This diff is collapsed.
ci_test_scripts/generate_gitlab_ci_tests.py
@@ -583,6 +583,16 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
     if (instr != "avx" and instr != "sse" and addr == "address-sanitize"):
         MasterOnly = True
+    # make non-master tests even faster
+    # kicking out gpu is not good, but at the moment we have a real problem with gpu runners
+    # should be returned when solved
+    if (g == "with-gpu"):
+        MasterOnly = True
+    if (a == "no-assumed-size"):
+        MasterOnly = True
+    if (instr == "avx2" or instr == "avx512"):
+        MasterOnly = True
+
     print("# " + cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" + a + "-" + b + "-" + g + "-" + cov + "-" + instr + "-" + addr)
     print(cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" + a + "-" + b + "-" + g + "-" + cov + "-" + instr + "-" + addr + "-jobs:")
     if (MasterOnly):
src/elpa1/elpa1_trans_ev_template.F90
@@ -98,61 +98,28 @@
       use precision
       use elpa_abstract_impl
       implicit none
+#include "../general/precision_kinds.F90"
       class(elpa_abstract_impl_t), intent(inout)    :: obj
       integer(kind=ik), intent(in)                  :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
-#if REALCASE == 1
-      real(kind=REAL_DATATYPE), intent(in)          :: tau(na)
-#endif
-#if COMPLEXCASE == 1
-      complex(kind=COMPLEX_DATATYPE), intent(in)    :: tau(na)
-#endif
+      MATH_DATATYPE(kind=rck), intent(in)           :: tau(na)
-#if REALCASE == 1
 #ifdef USE_ASSUMED_SIZE
-      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,*), q_mat(ldq,*)
+      MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(lda,*), q_mat(ldq,*)
 #else
-      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
-#endif
-#endif
-#if COMPLEXCASE == 1
-#ifdef USE_ASSUMED_SIZE
-      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
-#else
-      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
+      MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
 #endif
-#endif
       logical, intent(in)                           :: useGPU
       integer(kind=ik)                              :: max_stored_rows
-#if REALCASE == 1
-#ifdef DOUBLE_PRECISION_REAL
-      real(kind=rk8), parameter                     :: ZERO = 0.0_rk8, ONE = 1.0_rk8
-#else
-      real(kind=rk4), parameter                     :: ZERO = 0.0_rk4, ONE = 1.0_rk4
-#endif
-#endif
-#if COMPLEXCASE == 1
-#ifdef DOUBLE_PRECISION_COMPLEX
-      complex(kind=ck8), parameter                  :: ZERO = (0.0_rk8, 0.0_rk8), ONE = (1.0_rk8, 0.0_rk8)
-#else
-      complex(kind=ck4), parameter                  :: ZERO = (0.0_rk4, 0.0_rk4), ONE = (1.0_rk4, 0.0_rk4)
-#endif
-#endif
       integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols, mpierr
       integer(kind=ik)                              :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
       integer(kind=ik)                              :: l_cols, l_rows, l_colh, nstor
       integer(kind=ik)                              :: istep, n, nc, ic, ics, ice, nb, cur_pcol
       integer(kind=ik)                              :: hvn_ubnd, hvm_ubnd
-#if REALCASE == 1
-      real(kind=REAL_DATATYPE), allocatable         :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
-      real(kind=REAL_DATATYPE), allocatable         :: tmat(:,:), h1(:), h2(:), hvm1(:)
-#endif
-#if COMPLEXCASE == 1
-      complex(kind=COMPLEX_DATATYPE), allocatable   :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
-      complex(kind=COMPLEX_DATATYPE), allocatable   :: tmat(:,:), h1(:), h2(:), hvm1(:)
-#endif
+      MATH_DATATYPE(kind=rck), allocatable          :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
+      MATH_DATATYPE(kind=rck), allocatable          :: tmat(:,:), h1(:), h2(:), hvm1(:)
      integer(kind=ik)                               :: istat
      character(200)                                 :: errorMessage
      character(20)                                  :: gpuString
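
The declaration block above is the heart of this merge: the per-case real/complex declarations collapse into single MATH_DATATYPE(kind=rck) lines, with the kind constant rck and the parameters ZERO/ONE now coming from the included precision_kinds.F90. A minimal, self-contained sketch of the mechanism (a hypothetical demo file, not ELPA source; rck is defined inline here instead of via the include):

#define REALCASE 1
#if REALCASE == 1
#define MATH_DATATYPE real
#elif COMPLEXCASE == 1
#define MATH_DATATYPE complex
#endif

program demo_math_datatype
  implicit none
  ! stands in for what precision_kinds.F90 provides in ELPA
  integer, parameter :: rck = kind(1.0d0)
  MATH_DATATYPE(kind=rck) :: tau(4)   ! preprocessed to: real(kind=rck) :: tau(4)
  tau(:) = 1.0_rck
  print *, 'tau has kind', kind(tau)
end program demo_math_datatype

Built with a preprocessing compiler invocation such as gfortran -cpp, the same source line yields the real variant here and the complex variant when COMPLEXCASE is set instead.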
@@ -294,15 +261,7 @@
 #ifdef WITH_MPI
       call obj%timer%start("mpi_communication")
       if (nb>0) &
-        call MPI_Bcast(hvb, nb, &
-#if REALCASE == 1
-        &MPI_REAL_PRECISION&
-#endif
-#if COMPLEXCASE == 1
-        &MPI_COMPLEX_PRECISION&
-#endif
-        , cur_pcol, mpi_comm_cols, mpierr)
+        call MPI_Bcast(hvb, nb, MPI_MATH_DATATYPE_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
       call obj%timer%stop("mpi_communication")
 #endif /* WITH_MPI */
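
The MPI calls get the same treatment: a single MPI_MATH_DATATYPE_PRECISION symbol replaces the #if blocks that used to be spliced into each argument list. One plausible way to provide it, mirroring the names visible in this diff (a sketch of the idea, not a quote from the ELPA headers):

#if REALCASE == 1
#define MPI_MATH_DATATYPE_PRECISION MPI_REAL_PRECISION
#endif
#if COMPLEXCASE == 1
#define MPI_MATH_DATATYPE_PRECISION MPI_COMPLEX_PRECISION
#endif

With the datatype chosen once per compilation, each call site shrinks to a single line, as the rewritten MPI_Bcast above shows.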
@@ -341,14 +300,7 @@
       enddo
 #ifdef WITH_MPI
       call obj%timer%start("mpi_communication")
-      if (nc>0) call mpi_allreduce(h1, h2, nc, &
-#if REALCASE == 1
-      &MPI_REAL_PRECISION&
-#endif
-#if COMPLEXCASE == 1
-      &MPI_COMPLEX_PRECISION&
-#endif
-      &, MPI_SUM, mpi_comm_rows, mpierr)
+      if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
       call obj%timer%stop("mpi_communication")
 #else /* WITH_MPI */
@@ -361,13 +313,7 @@
       tmat(1,1) = tau(ice-nstor+1)
       do n = 1, nstor-1
         call obj%timer%start("blas")
-#if REALCASE == 1
-        call PRECISION_TRMV('L', 'T', 'N', &
-#endif
-#if COMPLEXCASE == 1
-        call PRECISION_TRMV('L', 'C', 'N', &
-#endif
-                            n, tmat, max_stored_rows, h2(nc+1), 1)
+        call PRECISION_TRMV('L', BLAS_TRANS_OR_CONJ, 'N', n, tmat, max_stored_rows, h2(nc+1), 1)
         call obj%timer%stop("blas")
         tmat(n+1,1:n) = &
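
BLAS_TRANS_OR_CONJ encapsulates the one real/complex difference in these BLAS calls, the transpose mode: real arithmetic uses 'T' (transpose), complex uses 'C' (conjugate transpose), exactly the two literals being removed. A definition consistent with both branches (an assumption, not quoted from the ELPA headers):

#if REALCASE == 1
#define BLAS_TRANS_OR_CONJ 'T'
#endif
#if COMPLEXCASE == 1
#define BLAS_TRANS_OR_CONJ 'C'
#endif

The same macro tidies the cublas_PRECISION_GEMM and PRECISION_GEMM calls in the next hunk.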
@@ -404,25 +350,15 @@
       if (l_rows>0) then
         if (useGPU) then
           call obj%timer%start("cublas")
-#if REALCASE == 1
-          call cublas_PRECISION_GEMM('T', 'N', &
-#endif
-#if COMPLEXCASE == 1
-          call cublas_PRECISION_GEMM('C', 'N', &
-#endif
+          call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
                                      nstor, l_cols, l_rows, ONE, hvm_dev, hvm_ubnd, &
                                      q_dev, ldq, ZERO, tmp_dev, nstor)
           call obj%timer%stop("cublas")
         else ! useGPU
           call obj%timer%start("blas")
-#if REALCASE == 1
-          call PRECISION_GEMM('T', 'N', &
-#endif
-#if COMPLEXCASE == 1
-          call PRECISION_GEMM('C', 'N', &
-#endif
+          call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
                               nstor, l_cols, l_rows, ONE, hvm, ubound(hvm, dim=1), &
                               q_mat, ldq, ZERO, tmp1, nstor)
           call obj%timer%stop("blas")
@@ -447,14 +383,7 @@
           check_memcpy_cuda("trans_ev", successCUDA)
         endif
         call obj%timer%start("mpi_communication")
-        call mpi_allreduce(tmp1, tmp2, nstor*l_cols, &
-#if REALCASE == 1
-        &MPI_REAL_PRECISION&
-#endif
-#if COMPLEXCASE == 1
-        &MPI_COMPLEX_PRECISION&
-#endif
-        &, MPI_SUM, mpi_comm_rows, mpierr)
+        call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
         call obj%timer%stop("mpi_communication")
         ! copy back tmp2 - after reduction...
         if (useGPU) then
src/elpa1/elpa1_tridiag_template.F90
@@ -111,27 +111,13 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
       integer(kind=ik), intent(in)                  :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
       logical, intent(in)                           :: useGPU, wantDebug
-#if REALCASE == 1
-      real(kind=REAL_DATATYPE), intent(out)         :: tau(na)
-#endif
-#if COMPLEXCASE == 1
-      complex(kind=COMPLEX_DATATYPE), intent(out)   :: tau(na)
-#endif
-#if REALCASE == 1
+      MATH_DATATYPE(kind=rck), intent(out)          :: tau(na)
 #ifdef USE_ASSUMED_SIZE
-      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,*)
+      MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(lda,*)
 #else
-      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,matrixCols)
-#endif
-#endif
-#if COMPLEXCASE == 1
-#ifdef USE_ASSUMED_SIZE
-      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*)
-#else
-      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
+      MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(lda,matrixCols)
 #endif
-#endif
-      real(kind=REAL_DATATYPE), intent(out)         :: d_vec(na), e_vec(na)
+      real(kind=rk), intent(out)                    :: d_vec(na), e_vec(na)
       integer(kind=ik), parameter                   :: max_stored_uv = 32
       logical, parameter                            :: mat_vec_as_one_block = .true.
@@ -160,50 +146,31 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
 #ifdef WITH_OPENMP
       integer(kind=ik)                              :: my_thread, n_threads, n_iter
 #endif
-      real(kind=REAL_DATATYPE)                      :: vnorm2
-#if REALCASE == 1
-      real(kind=REAL_DATATYPE)                      :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
-#endif
+      real(kind=rk)                                 :: vnorm2
+      MATH_DATATYPE(kind=rck)                       :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
 #if COMPLEXCASE == 1
-      complex(kind=COMPLEX_DATATYPE)                :: vav, xc, aux(2*max_stored_uv), aux1(2), aux2(2), aux3(1), vrl, xf
+      complex(kind=rck)                             :: aux3(1)
 #endif
-#if REALCASE == 1
-      real(kind=REAL_DATATYPE), allocatable         :: tmp(:), &
+      MATH_DATATYPE(kind=rck), allocatable          :: tmp(:), &
                                                        v_row(:), & ! used to store calculated Householder Vector
                                                        v_col(:), & ! the same Vector, but transposed - differently distributed among MPI tasks
                                                        u_row(:), &
                                                        u_col(:)
-#endif
-#if COMPLEXCASE == 1
-      complex(kind=COMPLEX_DATATYPE), allocatable   :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
-#endif
       ! the following two matrices store pairs of vectors v and u calculated in each step
       ! at most max_stored_uv Vector pairs are stored, then the matrix A_i is explicitly updated
       ! u and v are stored both in row and Vector forms
       ! pattern: v1,u1,v2,u2,v3,u3,....
       ! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
-#if REALCASE == 1
-      real(kind=REAL_DATATYPE), allocatable         :: vu_stored_rows(:,:)
+      MATH_DATATYPE(kind=rck), allocatable          :: vu_stored_rows(:,:)
       ! pattern: u1,v1,u2,v2,u3,v3,....
-      real(kind=REAL_DATATYPE), allocatable         :: uv_stored_cols(:,:)
-#endif
-#if COMPLEXCASE == 1
-      complex(kind=COMPLEX_DATATYPE), allocatable   :: vu_stored_rows(:,:), uv_stored_cols(:,:)
-#endif
+      MATH_DATATYPE(kind=rck), allocatable          :: uv_stored_cols(:,:)
 #ifdef WITH_OPENMP
-#if REALCASE == 1
-      real(kind=REAL_DATATYPE), allocatable         :: ur_p(:,:), uc_p(:,:)
-#endif
-#if COMPLEXCASE == 1
-      complex(kind=COMPLEX_DATATYPE), allocatable   :: ur_p(:,:), uc_p(:,:)
-#endif
+      MATH_DATATYPE(kind=rck), allocatable          :: ur_p(:,:), uc_p(:,:)
 #endif
-#if COMPLEXCASE == 1
-      real(kind=REAL_DATATYPE), allocatable         :: tmp_real(:)
-#endif
+      real(kind=rk), allocatable                    :: tmp_real(:)
       integer(kind=ik)                              :: min_tile_size, error
       integer(kind=ik)                              :: istat
       character(200)                                :: errorMessage
@@ -738,34 +705,17 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
                       mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
       ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
-#if REALCASE == 1
       x = 0
       if (l_cols>0) &
         x = dot_product(v_col(1:l_cols), u_col(1:l_cols))
-#endif
-#if COMPLEXCASE == 1
-      xc = 0
-      if (l_cols>0) &
-        xc = dot_product(v_col(1:l_cols), u_col(1:l_cols))
-#endif
 #ifdef WITH_MPI
       if (wantDebug) call obj%timer%start("mpi_communication")
-#if REALCASE == 1
       call mpi_allreduce(x, vav, 1, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
-#endif
-#if COMPLEXCASE == 1
-      call mpi_allreduce(xc, vav, 1, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
-#endif
       if (wantDebug) call obj%timer%stop("mpi_communication")
 #else /* WITH_MPI */
-#if REALCASE == 1
       vav = x
-#endif
-#if COMPLEXCASE == 1
-      vav = xc
-#endif
 #endif /* WITH_MPI */
@@ -977,12 +927,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
       deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
-#if REALCASE == 1
-        print *, "tridiag_real: error when deallocating uv_stored_cols "//errorMessage
-#endif
-#if COMPLEXCASE == 1
-        print *, "tridiag_complex: error when deallocating tmp "//errorMessage
-#endif
+        print *, "tridiag: error when deallocating "//errorMessage
         stop 1
       endif
@@ -1012,62 +957,31 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
       ! distribute the arrays d_vec and e_vec to all processors
-#if REALCASE == 1
-      allocate(tmp(na), stat=istat, errmsg=errorMessage)
-      if (istat .ne. 0) then
-        print *, "tridiag_real: error when allocating tmp "//errorMessage
-        stop 1
-      endif
-#endif
-#if COMPLEXCASE == 1
       allocate(tmp_real(na), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
-        print *, "tridiag_complex: error when allocating tmp_real "//errorMessage
+        print *, "tridiag: error when allocating tmp_real "//errorMessage
         stop 1
       endif
-#endif
 #ifdef WITH_MPI
       if (wantDebug) call obj%timer%start("mpi_communication")
-#if REALCASE == 1
-      tmp = d_vec
-      call mpi_allreduce(tmp, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
-      tmp = d_vec
-      call mpi_allreduce(tmp, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
-      tmp = e_vec
-      call mpi_allreduce(tmp, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
-      tmp = e_vec
-      call mpi_allreduce(tmp, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
-#endif
-#if COMPLEXCASE == 1
       tmp_real = d_vec
       call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
       tmp_real = d_vec
       call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
       tmp_real = e_vec
       call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
       tmp_real = e_vec
       call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
-#endif
       if (wantDebug) call obj%timer%stop("mpi_communication")
 #endif /* WITH_MPI */
-#if REALCASE == 1
-      deallocate(tmp, stat=istat, errmsg=errorMessage)
-      if (istat .ne. 0) then
-        print *, "tridiag_real: error when deallocating tmp "//errorMessage
-        stop 1
-      endif
-#endif
-#if COMPLEXCASE == 1
       deallocate(tmp_real, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
-        print *, "tridiag_complex: error when deallocating tmp_real "//errorMessage
+        print *, "tridiag: error when deallocating tmp_real "//errorMessage
         stop 1
       endif
-#endif
       call obj%timer%stop("tridiag_&
       &MATH_DATATYPE&
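
The copy through tmp_real before each reduction is there because MPI does not allow the send and receive buffers of mpi_allreduce to alias. A minimal standalone illustration of the pattern (hypothetical demo program; MPI_REAL8 and a fixed length stand in for ELPA's MPI_REAL_PRECISION and na):

program demo_allreduce_copy
  use mpi
  implicit none
  integer :: mpierr
  real(kind=8) :: d_vec(4), tmp_real(4)
  call MPI_Init(mpierr)
  d_vec = 1.0d0
  tmp_real = d_vec   ! send buffer must differ from receive buffer
  call MPI_Allreduce(tmp_real, d_vec, 4, MPI_REAL8, MPI_SUM, MPI_COMM_WORLD, mpierr)
  call MPI_Finalize(mpierr)
end program demo_allreduce_copy

MPI_IN_PLACE as the send buffer would be an alternative to the temporary.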
src/elpa2/elpa2_bandred_template.F90
@@ -64,8 +64,8 @@
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
(
obj
,
na
,
a
,
a_dev
,
lda
,
nblk
,
nbw
,
matrixCols
,
numBlocks
,
mpi_comm_rows
,
mpi_comm_cols
,
tmat
,
&
tmat_dev
,
wantDebug
,
useGPU
,
success
,
&
(
obj
,
na
,
a
_mat
,
a_dev
,
lda
,
nblk
,
nbw
,
matrixCols
,
numBlocks
,
mpi_comm_rows
,
mpi_comm_cols
,
tmat
,
&
tmat_dev
,
wantDebug
,
useGPU
,
success
&
#if REALCASE == 1
useQR
,
&
#endif
@@ -78,14 +78,14 @@
 !
 !  na          Order of matrix
 !
-!  a(lda,matrixCols)    Distributed matrix which should be reduced.
+!  a_mat(lda,matrixCols)    Distributed matrix which should be reduced.
 !              Distribution is like in Scalapack.
-!              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
-!              a(:,:) is overwritten on exit with the band and the Householder vectors
+!              Opposed to Scalapack, a_mat(:,:) must be set completely (upper and lower half)
+!              a_mat(:,:) is overwritten on exit with the band and the Householder vectors
 !              in the upper half.
 !
-!  lda         Leading dimension of a
-!  matrixCols  local columns of matrix a
+!  lda         Leading dimension of a_mat
+!  matrixCols  local columns of matrix a_mat
 !
 !  nblk        blocksize of cyclic distribution, must be the same in both directions!
 !
@@ -114,15 +114,16 @@
       integer(kind=ik)                  :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
 #ifdef USE_ASSUMED_SIZE
-      MATH_DATATYPE(kind=rck)           :: a(lda,*), tmat(nbw,nbw,*)
+      MATH_DATATYPE(kind=rck)           :: a_mat(lda,*), tmat(nbw,nbw,*)
 #else
-      MATH_DATATYPE(kind=rck)           :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
+      MATH_DATATYPE(kind=rck)           :: a_mat(lda,matrixCols), tmat(nbw,nbw,numBlocks)
 #endif
 #if REALCASE == 1
       real(kind=rk)                     :: eps
 #endif
       logical, intent(in)               :: useGPU
+      character(20)                     :: gpuString
       integer(kind=ik)                  :: my_prow, my_pcol, np_rows, np_cols, mpierr
       integer(kind=ik)                  :: l_cols, l_rows
@@ -187,12 +188,18 @@
       logical                           :: useGPU_reduction_lower_block_to_tridiagonal
       integer(kind=ik), intent(in)      :: max_threads
 
+      if (useGPU) then
+        gpuString = "_gpu"
+      else
+        gpuString = ""
+      endif
+
       call obj%timer%start("bandred_&
       &MATH_DATATYPE&
       &" // &
-      &PRECISION_SUFFIX &
-      )
+      &PRECISION_SUFFIX // &
+      gpuString)
 
       useGPU_reduction_lower_block_to_tridiagonal = .false.
       if (useGPU) then
#ifdef USE_ASSUMED_SIZE_QR
call
qr_pdgeqrf_2dcomm_
&
&
PRECISION
&
&(
obj
,
a
,
lda
,
matrixCols
,
vmrCPU
,
max
(
l_rows
,
1
),
vmrCols
,
tauvector
(
1
),
na
,
tmat
(
1
,
1
,
1
),
&
&(
obj
,
a
_mat
,
lda
,
matrixCols
,
vmrCPU
,
max
(
l_rows
,
1
),
vmrCols
,
tauvector
(
1
),
na
,
tmat
(
1
,
1
,
1
),
&
nbw
,
nbw
,
dwork_size
,
1
,
-1
,
na
,
nbw
,
nblk
,
nblk
,
na
,
na
,
1
,
0
,
PQRPARAM
(
1
:
11
),
&
mpi_comm_rows
,
mpi_comm_cols
,
blockheuristic
)
#else
call
qr_pdgeqrf_2dcomm_
&
&
PRECISION
&
&(
obj
,
a
(
1
:
lda
,
1
:
matrixCols
),
matrixCols
,
lda
,
vmrCPU
(
1
:
max
(
l_rows
,
1
),
1
:
vmrCols
),
max
(
l_rows
,
1
),
&
&(
obj
,
a
_mat
(
1
:
lda
,
1
:
matrixCols
),
matrixCols
,
lda
,
vmrCPU
(
1
:
max
(
l_rows
,
1
),
1
:
vmrCols
),
max
(
l_rows
,
1
),
&
vmrCols
,
tauvector
(
1
:
na
),
na
,
tmat
(
1
:
nbw
,
1
:
nbw
,
1
),
nbw
,
&
nbw
,
dwork_size
(
1
:
1
),
1
,
-1
,
na
,
nbw
,
nblk
,
nblk
,
na
,
na
,
1
,
0
,
PQRPARAM
(
1
:
11
),
&
mpi_comm_rows
,
mpi_comm_cols
,
blockheuristic
)
@@ -360,7 +367,7 @@
       cur_l_rows = 0
       cur_l_cols = 0
 
-      successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)*size_of_datatype, cudaMemcpyHostToDevice)
+      successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), (lda)*(na_cols)*size_of_datatype, cudaMemcpyHostToDevice)
       if (.not.(successCUDA)) then
         print *, "bandred_&
         &MATH_DATATYPE&
@@ -538,7 +545,7 @@
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
-            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
+            successCUDA = cuda_memcpy2d(loc(a_mat(1, lc_start)), &
                                         int((lda*size_of_datatype), kind=c_intptr_t), &
                                         (a_dev + int(((lc_start-1)*lda*size_of_datatype), kind=c_intptr_t)), &
                                         int(lda*size_of_datatype, kind=c_intptr_t), &
@@ -567,7 +574,7 @@
 #ifdef USE_ASSUMED_SIZE_QR
             call qr_pdgeqrf_2dcomm_&
                  &PRECISION&
-                 &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
+                 &(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, &
                   work_size, na, n_cols, nblk, nblk, &
                   istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1, &
@@ -577,7 +584,7 @@
 #else
             call qr_pdgeqrf_2dcomm_&
                  &PRECISION&
-                 &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols), &
+                 &(obj, a_mat(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols), &
                   max(l_rows,1), vmrCols, tauvector(1:na), na, &
                   tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                   work_size, na, n_cols, nblk, nblk, &
@@ -608,7 +615,7 @@
             ! Get Vector to be transformed; distribute last element and norm of
             ! remaining elements to all procs in current column
 
-            vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
+            vr(1:lr) = a_mat(1:lr,lch) ! Vector to be transformed
 
             if (my_prow == prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1), vr(1:lr-1))
@@ -646,11 +653,11 @@
             vr(1:lr) = vr(1:lr) * xf
             if (my_prow == prow(nrow, nblk, np_rows)) then
-              a(1:lr-1,lch) = vr(1:lr-1)
-              a(lr,lch) = vrl
+              a_mat(1:lr-1,lch) = vr(1:lr-1)
+              a_mat(lr,lch) = vrl
               vr(lr) = 1.0_rck
             else
-              a(1:lr,lch) = vr(1:lr)
+              a_mat(1:lr,lch) = vr(1:lr)
             endif
           endif
@@ -692,7 +699,7 @@
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
-            aux1(nlc) = dot_product(vr(1:lr), a(1:lr,lcx))
+            aux1(nlc) = dot_product(vr(1:lr), a_mat(1:lr,lcx))
           endif
         enddo
@@ -708,7 +715,7 @@
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
-            a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+            a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
           endif
         enddo
@@ -726,7 +733,7 @@
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
-            a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
+            a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
           endif
         enddo
@@ -739,7 +746,7 @@
 !            lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
 !            if (lcx>0) then
 !              nlc = nlc+1
-!              a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+!              a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
 !            endif
 !          enddo
if
(
lcx
>
0
)
then
mynlc
=
mynlc
+1
if
(
mod
((
j
-1
),
omp_get_num_threads
())
.eq.
omp_get_thread_num
()
)
then
if
(
lr
>
0
)
aux1
(
mynlc
)
=
dot_product
(
vr
(
1
:
lr
),
a
(
1
:
lr
,
lcx
))
if
(
lr
>
0
)
aux1
(
mynlc
)
=
dot_product
(
vr
(
1
:
lr
),
a
_mat
(
1
:
lr
,
lcx
))
endif