Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
85b2b68c
Commit
85b2b68c
authored
Feb 18, 2013
by
Alexander Heinecke
Browse files
removed multiple blocking in band_to_full
parent
13ef90f3
Changes
2
Expand all
Hide whitespace changes
Inline
Side-by-side
ELPA_2013.02.BETA/src/elpa2.f90
View file @
85b2b68c
...
...
@@ -684,9 +684,9 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
integer
l_cols
,
l_rows
,
l_colh
,
n_cols
integer
istep
,
lc
,
ncol
,
nrow
,
nb
,
ns
real
*
8
,
allocatable
::
tmp1
(:),
tmp2
(:),
hvb
(:),
hvm
(:,:)
,
tmat_complete
(:,:),
t_tmp
(:,:),
t_tmp2
(:,:)
real
*
8
,
allocatable
::
tmp1
(:),
tmp2
(:),
hvb
(:),
hvm
(:,:)
integer
pcol
,
prow
,
i
,
cwy_blocking
,
t_blocking
,
t_cols
,
t_rows
integer
pcol
,
prow
,
i
pcol
(
i
)
=
MOD
((
i
-1
)/
nblk
,
np_cols
)
!Processor col for global col number
prow
(
i
)
=
MOD
((
i
-1
)/
nblk
,
np_rows
)
!Processor row for global row number
...
...
@@ -696,32 +696,25 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
call
mpi_comm_rank
(
mpi_comm_cols
,
my_pcol
,
mpierr
)
call
mpi_comm_size
(
mpi_comm_cols
,
np_cols
,
mpierr
)
max_blocks_row
=
((
na
-1
)/
nblk
)/
np_rows
+
1
! Rows of A
max_blocks_col
=
((
nqc
-1
)/
nblk
)/
np_cols
+
1
! Columns of q!
max_local_rows
=
max_blocks_row
*
nblk
max_local_cols
=
max_blocks_col
*
nblk
t_blocking
=
2
! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
cwy_blocking
=
t_blocking
*
nbw
allocate
(
tmp1
(
max_local_cols
*
cwy_blocking
))
allocate
(
tmp2
(
max_local_cols
*
cwy_blocking
))
allocate
(
hvb
(
max_local_rows
*
cwy_blocking
))
allocate
(
hvm
(
max_local_rows
,
cwy_blocking
))
allocate
(
tmat_complete
(
cwy_blocking
,
cwy_blocking
))
allocate
(
t_tmp
(
cwy_blocking
,
nbw
))
allocate
(
t_tmp2
(
cwy_blocking
,
nbw
))
allocate
(
tmp1
(
max_local_cols
*
nbw
))
allocate
(
tmp2
(
max_local_cols
*
nbw
))
allocate
(
hvb
(
max_local_rows
*
nbw
))
allocate
(
hvm
(
max_local_rows
,
nbw
))
hvm
=
0
! Must be set to 0 !!!
hvb
=
0
! Safety only
l_cols
=
local_index
(
nqc
,
my_pcol
,
np_cols
,
nblk
,
-1
)
! Local columns of q
do
istep
=
1
,(
(
na
-1
)/
nbw
-1
)/
t_blocking
+
1
do
istep
=
1
,(
na
-1
)/
nbw
n_cols
=
MIN
(
na
,
istep
*
cwy_blocking
+
nbw
)
-
(
istep
-1
)
*
cwy_blocking
-
nbw
! Number of columns in current step
n_cols
=
MIN
(
na
,
(
istep
+1
)
*
nbw
)
-
istep
*
nbw
! Number of columns in current step
! Broadcast all Householder vectors for current step compressed in hvb
...
...
@@ -729,7 +722,7 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
ns
=
0
do
lc
=
1
,
n_cols
ncol
=
(
istep
-1
)
*
cwy_blocking
+
nbw
+
lc
! absolute column number of householder vector
ncol
=
istep
*
nbw
+
lc
! absolute column number of householder vector
nrow
=
ncol
-
nbw
! absolute number of pivot row
l_rows
=
local_index
(
nrow
-1
,
my_prow
,
np_rows
,
nblk
,
-1
)
! row length for bcast
...
...
@@ -749,7 +742,7 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
nb
=
0
do
lc
=
1
,
n_cols
nrow
=
(
istep
-1
)
*
cwy_blocking
+
lc
! absolute number of pivot row
nrow
=
(
istep
-1
)
*
nbw
+
lc
! absolute number of pivot row
l_rows
=
local_index
(
nrow
-1
,
my_prow
,
np_rows
,
nblk
,
-1
)
! row length for bcast
hvm
(
1
:
l_rows
,
lc
)
=
hvb
(
nb
+1
:
nb
+
l_rows
)
...
...
@@ -758,40 +751,26 @@ subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq,
nb
=
nb
+
l_rows
enddo
l_rows
=
local_index
(
MIN
(
na
,(
istep
+1
)
*
cwy_blocking
),
my_prow
,
np_rows
,
nblk
,
-1
)
! compute tmat2 out of tmat(:,:,)
tmat_complete
=
0
do
i
=
1
,
t_blocking
t_cols
=
MIN
(
nbw
,
n_cols
-
(
i
-1
)
*
nbw
)
if
(
t_cols
<=
0
)
exit
t_rows
=
(
i
-
1
)
*
nbw
tmat_complete
(
t_rows
+1
:
t_rows
+
t_cols
,
t_rows
+1
:
t_rows
+
t_cols
)
=
tmat
(
1
:
t_cols
,
1
:
t_cols
,(
istep
-1
)
*
t_blocking
+
i
)
if
(
i
>
1
)
then
call
dgemm
(
'T'
,
'N'
,
t_rows
,
t_cols
,
l_rows
,
1.d0
,
hvm
(
1
,
1
),
max_local_rows
,
hvm
(
1
,(
i
-1
)
*
nbw
+1
),
max_local_rows
,
0.d0
,
t_tmp
,
cwy_blocking
)
call
mpi_allreduce
(
t_tmp
,
t_tmp2
,
cwy_blocking
*
nbw
,
MPI_REAL8
,
MPI_SUM
,
mpi_comm_rows
,
mpierr
)
call
dtrmm
(
'L'
,
'U'
,
'N'
,
'N'
,
t_rows
,
t_cols
,
1.0d0
,
tmat_complete
,
cwy_blocking
,
t_tmp2
,
cwy_blocking
)
call
dtrmm
(
'R'
,
'U'
,
'N'
,
'N'
,
t_rows
,
t_cols
,
-1.0d0
,
tmat_complete
(
t_rows
+1
,
t_rows
+1
),
cwy_blocking
,
t_tmp2
,
cwy_blocking
)
tmat_complete
(
1
:
t_rows
,
t_rows
+1
:
t_rows
+
t_cols
)
=
t_tmp2
(
1
:
t_rows
,
1
:
t_cols
)
endif
enddo
l_rows
=
local_index
(
MIN
(
na
,(
istep
+1
)
*
nbw
),
my_prow
,
np_rows
,
nblk
,
-1
)
! Q = Q - V * T**T * V**T * Q
if
(
l_rows
>
0
)
then
call
dgemm
(
'T'
,
'N'
,
n_cols
,
l_cols
,
l_rows
,
1.d0
,
hvm
,
ubound
(
hvm
,
1
),
q
,
ldq
,
0.d0
,
tmp1
,
n_cols
)
call
dgemm
(
'T'
,
'N'
,
n_cols
,
l_cols
,
l_rows
,
1.d0
,
hvm
,
ubound
(
hvm
,
1
),
&
q
,
ldq
,
0.d0
,
tmp1
,
n_cols
)
else
tmp1
(
1
:
l_cols
*
n_cols
)
=
0
endif
call
mpi_allreduce
(
tmp1
,
tmp2
,
n_cols
*
l_cols
,
MPI_REAL8
,
MPI_SUM
,
mpi_comm_rows
,
mpierr
)
if
(
l_rows
>
0
)
then
call
dtrmm
(
'L'
,
'U'
,
'T'
,
'N'
,
n_cols
,
l_cols
,
1.0d0
,
tmat_complete
,
cwy_blocking
,
tmp2
,
n_cols
)
call
dgemm
(
'N'
,
'N'
,
l_rows
,
l_cols
,
n_cols
,
-1.d0
,
hvm
,
ubound
(
hvm
,
1
),
tmp2
,
n_cols
,
1.d0
,
q
,
ldq
)
call
dtrmm
(
'L'
,
'U'
,
'T'
,
'N'
,
n_cols
,
l_cols
,
1.0d0
,
tmat
(
1
,
1
,
istep
),
ubound
(
tmat
,
1
),
tmp2
,
n_cols
)
call
dgemm
(
'N'
,
'N'
,
l_rows
,
l_cols
,
n_cols
,
-1.d0
,
hvm
,
ubound
(
hvm
,
1
),
&
tmp2
,
n_cols
,
1.d0
,
q
,
ldq
)
endif
enddo
deallocate
(
tmp1
,
tmp2
,
hvb
,
hvm
,
tmat_complete
,
t_tmp
,
t_tmp2
)
deallocate
(
tmp1
,
tmp2
,
hvb
,
hvm
)
end
subroutine
trans_ev_band_to_full_real
...
...
@@ -2208,9 +2187,9 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
integer
l_cols
,
l_rows
,
l_colh
,
n_cols
integer
istep
,
lc
,
ncol
,
nrow
,
nb
,
ns
complex
*
16
,
allocatable
::
tmp1
(:),
tmp2
(:),
hvb
(:),
hvm
(:,:)
,
tmat_complete
(:,:),
t_tmp
(:,:),
t_tmp2
(:,:)
complex
*
16
,
allocatable
::
tmp1
(:),
tmp2
(:),
hvb
(:),
hvm
(:,:)
integer
pcol
,
prow
,
i
,
cwy_blocking
,
t_blocking
,
t_cols
,
t_rows
integer
pcol
,
prow
,
i
pcol
(
i
)
=
MOD
((
i
-1
)/
nblk
,
np_cols
)
!Processor col for global col number
prow
(
i
)
=
MOD
((
i
-1
)/
nblk
,
np_rows
)
!Processor row for global row number
...
...
@@ -2220,32 +2199,25 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
call
mpi_comm_rank
(
mpi_comm_cols
,
my_pcol
,
mpierr
)
call
mpi_comm_size
(
mpi_comm_cols
,
np_cols
,
mpierr
)
max_blocks_row
=
((
na
-1
)/
nblk
)/
np_rows
+
1
! Rows of A
max_blocks_col
=
((
nqc
-1
)/
nblk
)/
np_cols
+
1
! Columns of q!
max_local_rows
=
max_blocks_row
*
nblk
max_local_cols
=
max_blocks_col
*
nblk
t_blocking
=
2
! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
cwy_blocking
=
t_blocking
*
nbw
allocate
(
tmp1
(
max_local_cols
*
cwy_blocking
))
allocate
(
tmp2
(
max_local_cols
*
cwy_blocking
))
allocate
(
hvb
(
max_local_rows
*
cwy_blocking
))
allocate
(
hvm
(
max_local_rows
,
cwy_blocking
))
allocate
(
tmat_complete
(
cwy_blocking
,
cwy_blocking
))
allocate
(
t_tmp
(
cwy_blocking
,
nbw
))
allocate
(
t_tmp2
(
cwy_blocking
,
nbw
))
allocate
(
tmp1
(
max_local_cols
*
nbw
))
allocate
(
tmp2
(
max_local_cols
*
nbw
))
allocate
(
hvb
(
max_local_rows
*
nbw
))
allocate
(
hvm
(
max_local_rows
,
nbw
))
hvm
=
0
! Must be set to 0 !!!
hvb
=
0
! Safety only
l_cols
=
local_index
(
nqc
,
my_pcol
,
np_cols
,
nblk
,
-1
)
! Local columns of q
do
istep
=
1
,(
(
na
-1
)/
nbw
-1
)/
t_blocking
+
1
do
istep
=
1
,(
na
-1
)/
nbw
n_cols
=
MIN
(
na
,
istep
*
cwy_blocking
+
nbw
)
-
(
istep
-1
)
*
cwy_blocking
-
nbw
! Number of columns in current step
n_cols
=
MIN
(
na
,
(
istep
+1
)
*
nbw
)
-
istep
*
nbw
! Number of columns in current step
! Broadcast all Householder vectors for current step compressed in hvb
...
...
@@ -2253,7 +2225,7 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
ns
=
0
do
lc
=
1
,
n_cols
ncol
=
(
istep
-1
)
*
cwy_blocking
+
nbw
+
lc
! absolute column number of householder vector
ncol
=
istep
*
nbw
+
lc
! absolute column number of householder vector
nrow
=
ncol
-
nbw
! absolute number of pivot row
l_rows
=
local_index
(
nrow
-1
,
my_prow
,
np_rows
,
nblk
,
-1
)
! row length for bcast
...
...
@@ -2273,7 +2245,7 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
nb
=
0
do
lc
=
1
,
n_cols
nrow
=
(
istep
-1
)
*
cwy_blocking
+
lc
! absolute number of pivot row
nrow
=
(
istep
-1
)
*
nbw
+
lc
! absolute number of pivot row
l_rows
=
local_index
(
nrow
-1
,
my_prow
,
np_rows
,
nblk
,
-1
)
! row length for bcast
hvm
(
1
:
l_rows
,
lc
)
=
hvb
(
nb
+1
:
nb
+
l_rows
)
...
...
@@ -2282,25 +2254,9 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
nb
=
nb
+
l_rows
enddo
l_rows
=
local_index
(
MIN
(
na
,(
istep
+1
)
*
cwy_blocking
),
my_prow
,
np_rows
,
nblk
,
-1
)
l_rows
=
local_index
(
MIN
(
na
,(
istep
+1
)
*
nbw
),
my_prow
,
np_rows
,
nblk
,
-1
)
! Q = Q - V * T**T * V**T * Q
! compute tmat2 out of tmat(:,:,)
tmat_complete
(:,:)
=
CZERO
do
i
=
1
,
t_blocking
t_cols
=
MIN
(
nbw
,
n_cols
-
(
i
-1
)
*
nbw
)
if
(
t_cols
<=
0
)
exit
t_rows
=
(
i
-
1
)
*
nbw
tmat_complete
(
t_rows
+1
:
t_rows
+
t_cols
,
t_rows
+1
:
t_rows
+
t_cols
)
=
tmat
(
1
:
t_cols
,
1
:
t_cols
,(
istep
-1
)
*
t_blocking
+
i
)
if
(
i
>
1
)
then
call
zgemm
(
'C'
,
'N'
,
t_rows
,
t_cols
,
l_rows
,
CONE
,
hvm
(
1
,
1
),
max_local_rows
,
hvm
(
1
,(
i
-1
)
*
nbw
+1
),
max_local_rows
,
CZERO
,
t_tmp
,
cwy_blocking
)
call
mpi_allreduce
(
t_tmp
,
t_tmp2
,
cwy_blocking
*
nbw
,
MPI_DOUBLE_COMPLEX
,
MPI_SUM
,
mpi_comm_rows
,
mpierr
)
call
ztrmm
(
'L'
,
'U'
,
'N'
,
'N'
,
t_rows
,
t_cols
,
CONE
,
tmat_complete
,
cwy_blocking
,
t_tmp2
,
cwy_blocking
)
call
ztrmm
(
'R'
,
'U'
,
'N'
,
'N'
,
t_rows
,
t_cols
,
-
CONE
,
tmat_complete
(
t_rows
+1
,
t_rows
+1
),
cwy_blocking
,
t_tmp2
,
cwy_blocking
)
tmat_complete
(
1
:
t_rows
,
t_rows
+1
:
t_rows
+
t_cols
)
=
t_tmp2
(
1
:
t_rows
,
1
:
t_cols
)
endif
enddo
if
(
l_rows
>
0
)
then
call
zgemm
(
'C'
,
'N'
,
n_cols
,
l_cols
,
l_rows
,
CONE
,
hvm
,
ubound
(
hvm
,
1
),
&
...
...
@@ -2310,14 +2266,14 @@ subroutine trans_ev_band_to_full_complex(na, nqc, nblk, nbw, a, lda, tmat, q, ld
endif
call
mpi_allreduce
(
tmp1
,
tmp2
,
n_cols
*
l_cols
,
MPI_DOUBLE_COMPLEX
,
MPI_SUM
,
mpi_comm_rows
,
mpierr
)
if
(
l_rows
>
0
)
then
call
ztrmm
(
'L'
,
'U'
,
'C'
,
'N'
,
n_cols
,
l_cols
,
CONE
,
tmat
_complete
,
cwy_blocking
,
tmp2
,
n_cols
)
call
ztrmm
(
'L'
,
'U'
,
'C'
,
'N'
,
n_cols
,
l_cols
,
CONE
,
tmat
(
1
,
1
,
istep
),
ubound
(
tmat
,
1
)
,
tmp2
,
n_cols
)
call
zgemm
(
'N'
,
'N'
,
l_rows
,
l_cols
,
n_cols
,
-
CONE
,
hvm
,
ubound
(
hvm
,
1
),
&
tmp2
,
n_cols
,
CONE
,
q
,
ldq
)
endif
enddo
deallocate
(
tmp1
,
tmp2
,
hvb
,
hvm
,
tmat_complete
,
t_tmp
,
t_tmp2
)
deallocate
(
tmp1
,
tmp2
,
hvb
,
hvm
)
end
subroutine
trans_ev_band_to_full_complex
...
...
ELPA_2013.02.BETA/src/elpa2.f90_save
0 → 100644
View file @
85b2b68c
This diff is collapsed.
Click to expand it.
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment