Commit 830b54aa, authored May 03, 2018 by Pavel Kus

renaming matrix a to a_mat

parent 68e4bfa8
Changes: 3, Pipelines: 1
src/elpa2/elpa2_bandred_template.F90
@@ -64,7 +64,7 @@
    &MATH_DATATYPE&
    &_&
    &PRECISION &
-   (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
+   (obj, na, a_mat, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
    tmat_dev, wantDebug, useGPU, success &
 #if REALCASE == 1
    , useQR)
@@ -80,14 +80,14 @@
 !
 ! na            Order of matrix
 !
-! a(lda,matrixCols)      Distributed matrix which should be reduced.
+! a_mat(lda,matrixCols)  Distributed matrix which should be reduced.
 !               Distribution is like in Scalapack.
-!               Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
-!               a(:,:) is overwritten on exit with the band and the Householder vectors
+!               Opposed to Scalapack, a_mat(:,:) must be set completely (upper and lower half)
+!               a_mat(:,:) is overwritten on exit with the band and the Householder vectors
 !               in the upper half.
 !
-! lda           Leading dimension of a
-! matrixCols    local columns of matrix a
+! lda           Leading dimension of a_mat
+! matrixCols    local columns of matrix a_mat
 !
 ! nblk          blocksize of cyclic distribution, must be the same in both directions!
 !
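The "distribution like in Scalapack" mentioned above is the block-cyclic layout: consecutive blocks of nblk rows (or columns) are dealt out round-robin across the process grid. As a quick illustration, a standalone sketch, not ELPA code, using the standard 1-D block-cyclic mapping with arbitrary constants:

    ! Maps a global 1-based index to its owning process and local index
    ! under a 1-D block-cyclic distribution (block size nblk, np procs).
    program block_cyclic_demo
      implicit none
      integer, parameter :: nblk = 2, np = 3
      integer :: g, owner, loc
      do g = 1, 12
        owner = mod((g-1)/nblk, np)                          ! process that holds g
        loc   = ((g-1)/(nblk*np))*nblk + mod(g-1,nblk) + 1   ! index within that process
        print '(3(a,i0))', 'global ', g, ' -> proc ', owner, ', local ', loc
      end do
    end program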
@@ -116,9 +116,9 @@
   integer(kind=ik)            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
 #ifdef USE_ASSUMED_SIZE
-  MATH_DATATYPE(kind=rck)     :: a(lda,*), tmat(nbw,nbw,*)
+  MATH_DATATYPE(kind=rck)     :: a_mat(lda,*), tmat(nbw,nbw,*)
 #else
-  MATH_DATATYPE(kind=rck)     :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
+  MATH_DATATYPE(kind=rck)     :: a_mat(lda,matrixCols), tmat(nbw,nbw,numBlocks)
 #endif
 #if REALCASE == 1
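The USE_ASSUMED_SIZE toggle above switches a_mat between an assumed-size dummy, a_mat(lda,*), whose last extent is unknown to the compiler, and an explicit-shape dummy, a_mat(lda,matrixCols), whose bounds are visible. A minimal standalone illustration of the two declaration styles (not ELPA code):

    subroutine touch_assumed(a, lda, ncols)
      implicit none
      integer, intent(in) :: lda, ncols
      real, intent(inout) :: a(lda,*)      ! assumed-size: caller must guarantee ncols columns exist
      a(1,ncols) = 0.0
    end subroutine

    subroutine touch_explicit(a, lda, ncols)
      implicit none
      integer, intent(in) :: lda, ncols
      real, intent(inout) :: a(lda,ncols)  ! explicit-shape: extent known, bounds checking possible
      a(1,ncols) = 0.0
    end subroutine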
@@ -325,14 +325,14 @@
 #ifdef USE_ASSUMED_SIZE_QR
       call qr_pdgeqrf_2dcomm_&
       &PRECISION&
-      &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
+      &(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
       nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
       mpi_comm_rows, mpi_comm_cols, blockheuristic)
 #else
       call qr_pdgeqrf_2dcomm_&
       &PRECISION&
-      &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
+      &(obj, a_mat(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
       vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
       nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
       mpi_comm_rows, mpi_comm_cols, blockheuristic)
@@ -361,7 +361,7 @@
      cur_l_rows = 0
      cur_l_cols = 0
-     successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)*size_of_datatype, cudaMemcpyHostToDevice)
+     successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), (lda)*(na_cols)*size_of_datatype, cudaMemcpyHostToDevice)
      if (.not.(successCUDA)) then
        print *, "bandred_&
        &MATH_DATATYPE&
@@ -539,7 +539,7 @@
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
          if (my_pcol == cur_pcol) then
-           successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
+           successCUDA = cuda_memcpy2d(loc(a_mat(1, lc_start)), &
                                        int((lda*size_of_datatype), kind=c_intptr_t), &
                                        (a_dev + int(((lc_start-1)*lda*size_of_datatype), kind=c_intptr_t)), &
                                        int(lda*size_of_datatype, kind=c_intptr_t), &
@@ -568,7 +568,7 @@
 #ifdef USE_ASSUMED_SIZE_QR
            call qr_pdgeqrf_2dcomm_&
            &PRECISION&
-           &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
+           &(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
            na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, &
            work_size, na, n_cols, nblk, nblk, &
            istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
@@ -578,7 +578,7 @@
 #else
            call qr_pdgeqrf_2dcomm_&
            &PRECISION&
-           &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols), &
+           &(obj, a_mat(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols), &
            max(l_rows,1), vmrCols, tauvector(1:na), na, &
            tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
            work_size, na, n_cols, nblk, nblk, &
@@ -609,7 +609,7 @@
          ! Get Vector to be transformed; distribute last element and norm of
          ! remaining elements to all procs in current column
-         vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
+         vr(1:lr) = a_mat(1:lr,lch) ! Vector to be transformed
          if (my_prow == prow(nrow, nblk, np_rows)) then
            aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
@@ -647,11 +647,11 @@
            vr(1:lr) = vr(1:lr) * xf
            if (my_prow == prow(nrow, nblk, np_rows)) then
-             a(1:lr-1,lch) = vr(1:lr-1)
-             a(lr,lch) = vrl
+             a_mat(1:lr-1,lch) = vr(1:lr-1)
+             a_mat(lr,lch) = vrl
              vr(lr) = 1.0_rck
            else
-             a(1:lr,lch) = vr(1:lr)
+             a_mat(1:lr,lch) = vr(1:lr)
            endif
          endif
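The lines above store the computed Householder vector back into a_mat: the scaled sub-diagonal part goes into a_mat(1:lr-1,lch), the resulting band element vrl into a_mat(lr,lch), and the implicit unit entry is kept only in vr(lr). For orientation, here is a serial, non-distributed sketch of the same reflector construction, assuming real(8) arithmetic (illustration only, not ELPA code):

    ! Build v (with v(n) = 1) and tau so that (I - tau*v*v^T) x = (0,...,0,beta)^T.
    subroutine housegen(x, n, v, tau, beta)
      implicit none
      integer, intent(in)  :: n
      real(8), intent(in)  :: x(n)
      real(8), intent(out) :: v(n), tau, beta
      real(8) :: alpha, xnorm
      alpha = x(n)
      xnorm = sqrt(dot_product(x(1:n-1), x(1:n-1)))
      beta  = -sign(sqrt(alpha**2 + xnorm**2), alpha)  ! sign choice avoids cancellation
      tau   = (beta - alpha)/beta
      v(1:n-1) = x(1:n-1)/(alpha - beta)
      v(n)  = 1.0d0
    end subroutine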
@@ -693,7 +693,7 @@
              lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
              if (lcx>0) then
                nlc = nlc+1
-               aux1(nlc) = dot_product(vr(1:lr), a(1:lr,lcx))
+               aux1(nlc) = dot_product(vr(1:lr), a_mat(1:lr,lcx))
              endif
            enddo
@@ -709,7 +709,7 @@
              lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
              if (lcx>0) then
                nlc = nlc+1
-               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+               a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
              endif
            enddo
@@ -727,7 +727,7 @@
              lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
              if (lcx>0) then
                nlc = nlc+1
-               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
+               a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
              endif
            enddo
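These hunks are the two passes of one reflector application: first every local column contributes a dot product vr^H * a_mat(:,lcx) into aux1 (summed across the process column via MPI in the surrounding code), then each column receives its rank-1 correction. A serial distillation of the pattern (illustration only, not ELPA code):

    subroutine apply_reflector(a, lda, ncols, lr, v, tau)
      implicit none
      integer, intent(in)       :: lda, ncols, lr
      complex(8), intent(inout) :: a(lda,ncols)
      complex(8), intent(in)    :: v(lr), tau
      complex(8) :: aux(ncols)
      integer :: j
      do j = 1, ncols      ! pass 1: dot products (dot_product conjugates v)
        aux(j) = dot_product(v(1:lr), a(1:lr,j))
      end do
      do j = 1, ncols      ! pass 2: rank-1 update per column
        a(1:lr,j) = a(1:lr,j) - conjg(tau)*aux(j)*v(1:lr)
      end do
    end subroutine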
@@ -740,7 +740,7 @@
 !            lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
 !            if (lcx>0) then
 !              nlc = nlc+1
-!              a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+!              a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
 !            endif
 !          enddo
@@ -763,7 +763,7 @@
                if (lcx>0) then
                  mynlc = mynlc+1
                  if (mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num()) then
-                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr), a(1:lr,lcx))
+                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr), a_mat(1:lr,lcx))
                  endif
                endif
              enddo
@@ -796,10 +796,10 @@
                do pp = 1, transformChunkSize
                  if (pp + ii > lr) exit
 #if REALCASE == 1
-                 a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
+                 a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
 #endif
 #if COMPLEXCASE == 1
-                 a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
+                 a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
 #endif
                enddo
              enddo
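The mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() test used in this OpenMP region implements a round-robin split: all threads scan the same column range, but each one takes only every nthreads-th column, so the updates are disjoint without any locking. Stripped to its essentials (illustration only, not ELPA code; compile with OpenMP enabled, e.g. -fopenmp):

    program round_robin_demo
      use omp_lib
      implicit none
      integer :: j
      !$omp parallel private(j)
      do j = 1, 8
        if (mod(j-1, omp_get_num_threads()) == omp_get_thread_num()) then
          print '(2(a,i0))', 'thread ', omp_get_thread_num(), ' takes column ', j
        end if
      end do
      !$omp end parallel
    end program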
@@ -814,7 +814,7 @@
              lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
              if (lcx>0) then
                nlc = nlc+1
-               if (lr>0) aux1(nlc) = dot_product(vr(1:lr), a(1:lr,lcx))
+               if (lr>0) aux1(nlc) = dot_product(vr(1:lr), a_mat(1:lr,lcx))
              endif
            enddo
@@ -835,10 +835,10 @@
              if (lcx>0) then
                nlc = nlc+1
 #if REALCASE == 1
-               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
+               a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
 #endif
 #if COMPLEXCASE == 1
-               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+               a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
 #endif
              endif
            enddo
@@ -851,7 +851,7 @@
          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d((a_dev + &
                          int(((lc_start-1)*lda*size_of_datatype), kind=c_intptr_t)), &
-                         int(lda*size_of_datatype, kind=c_intptr_t), loc(a(1, lc_start)), &
+                         int(lda*size_of_datatype, kind=c_intptr_t), loc(a_mat(1, lc_start)), &
                          int(lda*size_of_datatype, kind=c_intptr_t), &
                          int(lr_end*size_of_datatype, kind=c_intptr_t), &
                          int((lc_end - lc_start+1), kind=c_intptr_t), &
@@ -932,7 +932,7 @@
          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d((a_dev + &
                          int(((lc_start-1)*lda*size_of_datatype), kind=c_intptr_t)), &
-                         int(lda*size_of_datatype, kind=c_intptr_t), loc(a(1, lc_start)), &
+                         int(lda*size_of_datatype, kind=c_intptr_t), loc(a_mat(1, lc_start)), &
                          int(lda*size_of_datatype, kind=c_intptr_t), &
                          int(lr_end*size_of_datatype, kind=c_intptr_t), &
                          int((lc_end - lc_start+1), kind=c_intptr_t), &
@@ -989,14 +989,14 @@
        lre = min(l_rows,(i+1)*l_rows_tile)
        call obj%timer%start("blas")
-       call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a(1,lcs), ubound(a,dim=1), &
+       call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
                            vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
        call obj%timer%stop("blas")
        if (i==0) cycle
        lre = min(l_rows,i*l_rows_tile)
        call obj%timer%start("blas")
-       call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a(1,lcs), lda, &
+       call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
                            umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
        call obj%timer%stop("blas")
      enddo
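In the first GEMM above, 'C' requests op(A) = A^H, so the call accumulates umcCPU(lcs:lce,1:n_cols) plus a_mat(1:lre,lcs:lce)^H * vmrCPU(1:lre,1:n_cols), with ubound(a_mat,dim=1) supplying the leading dimension BLAS expects; the second call applies the untransposed tile the other way around. A runnable distillation of the first call using plain double-precision BLAS (illustration only, not ELPA code; link against a BLAS, e.g. -lblas):

    program gemm_tile_demo
      implicit none
      real(8) :: a(4,3), v(4,2), u(3,2)
      a = 1.0d0; v = 2.0d0; u = 0.0d0
      ! C := alpha*op(A)*op(B) + beta*C with op(A) = A^T ('T' is the real
      ! analogue of the 'C' used above): u := 1.0*a^T*v + 1.0*u
      call dgemm('T', 'N', 3, 2, 4, 1.0d0, a, 4, v, 4, 1.0d0, u, 3)
      print *, u        ! every entry is 4 * (1*2) = 8
    end program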
@@ -1061,7 +1061,7 @@
          if (lre > lrs .and. l_cols > lcs) then
            call obj%timer%start("blas")
            call PRECISION_GEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1, &
-                               ONE, a(lrs,lcs), ubound(a,dim=1), &
+                               ONE, a_mat(lrs,lcs), ubound(a_mat,dim=1), &
                                umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), &
                                ZERO, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1))
            call obj%timer%stop("blas")
@@ -1072,7 +1072,7 @@
            call obj%timer%start("blas")
            call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
                                lce-lcs+1, n_cols, lrs-1, &
-                               ONE, a(1,lcs), ubound(a,dim=1), &
+                               ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
                                vmrCPU(1,1), ubound(vmrCPU,dim=1), &
                                ZERO, umcCPU(lcs,1), ubound(umcCPU,dim=1))
            call obj%timer%stop("blas")
@@ -1152,13 +1152,13 @@
            call obj%timer%start("blas")
            call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
-                               lce-lcs+1, n_cols, lre, ONE, a(1,lcs), ubound(a,dim=1), &
+                               lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
                                vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
            call obj%timer%stop("blas")
            if (i==0) cycle
            lre = min(l_rows,i*l_rows_tile)
            call obj%timer%start("blas")
-           call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a(1,lcs), lda, &
+           call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
                                umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, &
                                vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
            call obj%timer%stop("blas")
@@ -1494,7 +1494,7 @@
        call obj%timer%start("blas")
        call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, myend-mystart+1, lce-lcs+1, 2*n_cols, -ONE, &
                            vmrCPU(mystart,1), ubound(vmrCPU,1), umcCPU(lcs,1), ubound(umcCPU,1), &
-                           ONE, a(mystart,lcs), ubound(a,1))
+                           ONE, a_mat(mystart,lcs), ubound(a_mat,1))
        call obj%timer%stop("blas")
      enddo
 !$omp end parallel
@@ -1507,7 +1507,7 @@
 !         call obj%timer%start("blas")
 !         call PRECISION_GEMM('N', 'C', lre,lce-lcs+1, 2*n_cols, -ONE, &
 !                             vmrCPU, ubound(vmrCPU,dim=1), umcCPU(lcs,1), ubound(umcCPU,dim=1), &
-!                             ONE, a(1,lcs), lda)
+!                             ONE, a_mat(1,lcs), lda)
 !         call obj%timer%stop("blas")
 !       enddo
 !#endif
@@ -1536,7 +1536,7 @@
          call obj%timer%start("blas")
          call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, lre, lce-lcs+1, 2*n_cols, -ONE, &
                              vmrCPU, ubound(vmrCPU,dim=1), umcCPU(lcs,1), ubound(umcCPU,dim=1), &
-                             ONE, a(1,lcs), lda)
+                             ONE, a_mat(1,lcs), lda)
          call obj%timer%stop("blas")
        endif ! useGPU
      enddo ! i=0,(istep*nbw-1)/tile_size
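The -ONE in the GEMM calls above turns the multiply into a subtraction: the trailing part of a_mat is updated as a_mat minus vmrCPU times umcCPU^H over the 2*n_cols accumulated vectors, which is the blocked trailing-matrix update of the band reduction. The scaling behaviour in isolation (illustration only, not ELPA code; link against a BLAS):

    program rank_update_demo
      implicit none
      real(8) :: a(4,3), v(4,2), u(3,2)
      a = 10.0d0; v = 1.0d0; u = 2.0d0
      ! a := -1.0 * v * u^T + 1.0 * a, i.e. subtract the block outer product
      call dgemm('N', 'T', 4, 3, 2, -1.0d0, v, 4, u, 3, 1.0d0, a, 4)
      print *, a(1,1)   ! 10 - (1*2 + 1*2) = 6
    end program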
src/elpa2/elpa2_tridiag_band_template.F90
@@ -55,7 +55,7 @@
    &MATH_DATATYPE&
    &_&
    &PRECISION &
-   (obj, na, nb, nblk, aMatrix, a_dev, lda, d, e, matrixCols, &
+   (obj, na, nb, nblk, a_mat, a_dev, lda, d, e, matrixCols, &
    hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug)
 !-------------------------------------------------------------------------------
 ! tridiag_band_real/complex:
@@ -67,7 +67,7 @@
 !
 ! nblk        blocksize of cyclic distribution, must be the same in both directions!
 !
-! aMatrix(lda,matrixCols)  Distributed system matrix reduced to banded form in the upper diagonal
+! a_mat(lda,matrixCols)    Distributed system matrix reduced to banded form in the upper diagonal
 !
 ! lda         Leading dimension of a
 ! matrixCols  local columns of matrix a
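For readers unfamiliar with the banded form this comment refers to: after bandred, only a band of semibandwidth nb around the diagonal is nonzero, so the matrix can be stored compactly. One common convention is LAPACK-style upper-band storage, sketched below as an illustration; whether ELPA's internal ab(:,:) uses exactly this layout is not visible in this diff:

    program band_store_demo
      implicit none
      integer, parameter :: n = 6, nb = 2    ! order and semibandwidth
      real(8) :: full(n,n), ab(nb+1,n)
      integer :: i, j
      call random_number(full)
      ab = 0.0d0
      do j = 1, n
        do i = max(1, j-nb), j               ! upper-band entries only
          ab(nb+1+i-j, j) = full(i, j)       ! diagonal lands in row nb+1
        end do
      end do
      print *, ab(nb+1,1) == full(1,1)
    end program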
@@ -95,9 +95,9 @@
    logical, intent(in)                  :: useGPU, wantDebug
    integer(kind=ik), intent(in)         :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator
 #ifdef USE_ASSUMED_SIZE
-   MATH_DATATYPE(kind=rck), intent(in)  :: aMatrix(lda,*)
+   MATH_DATATYPE(kind=rck), intent(in)  :: a_mat(lda,*)
 #else
-   MATH_DATATYPE(kind=rck), intent(in)  :: aMatrix(lda,matrixCols)
+   MATH_DATATYPE(kind=rck), intent(in)  :: a_mat(lda,matrixCols)
 #endif
    integer(kind=c_intptr_t)             :: a_dev
    real(kind=rk), intent(out)           :: d(na), e(na) ! set only on PE 0
@@ -228,7 +228,7 @@
    &MATH_DATATYPE&
    &_&
    &PRECISION&
-   &(obj, aMatrix, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
+   &(obj, a_mat, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)

    ! Calculate the workload for each sweep in the back transformation
    ! and the space requirements to hold the HH vectors
src/elpa2/redist_band.F90
@@ -51,7 +51,7 @@ subroutine redist_band_&
 &MATH_DATATYPE&
 &_&
 &PRECISION &
-(obj, a, a_dev, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
+(obj, a_mat, a_dev, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
   use elpa_abstract_impl
   use elpa2_workload
@@ -65,7 +65,7 @@ subroutine redist_band_&
   class(elpa_abstract_impl_t), intent(inout)        :: obj
   logical, intent(in)                               :: useGPU
   integer(kind=ik), intent(in)                      :: lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator
-  MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(in)   :: a(lda, matrixCols)
+  MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(in)   :: a_mat(lda, matrixCols)
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(out)  :: ab(:,:)
   integer(kind=ik), allocatable                     :: ncnt_s(:), nstart_s(:), ncnt_r(:), nstart_r(:), &
@@ -91,7 +91,7 @@ subroutine redist_band_&
   if (useGPU) then
     ! copy a_dev to aMatrix
-    successCUDA = cuda_memcpy(loc(a), int(a_dev, kind=c_intptr_t), int(lda*matrixCols*size_of_datatype, kind=c_intptr_t), &
+    successCUDA = cuda_memcpy(loc(a_mat), int(a_dev, kind=c_intptr_t), int(lda*matrixCols*size_of_datatype, kind=c_intptr_t), &
                               cudaMemcpyDeviceToHost)
     if (.not.(successCUDA)) then
       print *, "redist_band_&
@@ -175,8 +175,8 @@ subroutine redist_band_&
   ! Fill send buffer
-  l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
-  l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of a
+  l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
+  l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of a_mat
   np = 0
   do j = 0, (na-1)/nblk ! loop over rows of blocks
@@ -190,7 +190,7 @@ subroutine redist_band_&
           jl = MIN(nblk, l_rows-js)
           il = MIN(nblk, l_cols-is)
-          sbuf(1:jl,1:il,nstart_s(np)) = a(js+1:js+jl, is+1:is+il)
+          sbuf(1:jl,1:il,nstart_s(np)) = a_mat(js+1:js+jl, is+1:is+il)
         endif
       enddo
     endif