elpa / elpa · Commits

Commit 9655c781
authored May 30, 2012 by Thomas Auckenthaler

"use mpi" replaced by "include 'mpif.h'"
dropped old rank-2 QR-decomposition (blockedQR.f90)

parent db110dfc
Changes 6
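The commit message describes swapping the Fortran 90 `use mpi` module interface for the legacy `include 'mpif.h'` header in the changed files. A minimal sketch of that substitution, assuming a generic broadcast wrapper (the subroutine name and arguments below are illustrative only, not taken from this commit):

    ! Illustrative only: sketch of the "use mpi" -> "include 'mpif.h'" change
    subroutine bcast_example(buf, n, root, comm)
      ! before this commit the MPI interface would be pulled in via:  use mpi
      implicit none
      include 'mpif.h'   ! after: legacy include-file interface (defines MPI_REAL8 etc.)
      integer n, root, comm, mpierr
      real*8 buf(*)
      call mpi_bcast(buf, n, MPI_REAL8, root, comm, mpierr)
    end subroutine bcast_example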
ELPA_development_version/src/blockedQR.f90 deleted (100644 → 0)
!
module blockedQR
!
  implicit none

  PRIVATE
  public :: qr_rank2_real

  real*8 dnrm2, ddot
  EXTERNAL dnrm2, dgemv, dscal, dger, daxpy, ddot

contains

  subroutine qr_rank2_real(a, lda, vmr, ldv, tmat, nbw, istep, cols, nblk, mpi_comm_rows, mpi_comm_cols, work, eps)
    use mpi

    implicit none

    integer lda, ldv, nbw, nblk, mpi_comm_rows, mpi_comm_cols, cols, istep
    real*8 :: a(lda,*), vmr(ldv,*), tmat(nbw,nbw,*), work(*)
    real*8 eps
    ! matrix organization
    integer ncol
    integer firstcol
    integer nrow
    integer local_rows
    integer local_columns
    integer top11_row
    integer top11_col
    integer top22_row
    integer top22_col
    integer top11_column_comm_rank
    integer top11_row_comm_rank
    integer top22_column_comm_rank
    integer top22_row_comm_rank
    integer row
    integer n_cols
    real*8 top11
    real*8 top12
    real*8 top21
    real*8 top22
    real*8 tau1
    real*8 tau2
    real*8 beta1
    real*8 beta2
    real*8 alpha
    real*8 z2_div_tau1
    real*8 z2
    real*8 dot_second_second
    real*8 dot_first_first
    real*8 dot_first_second

    ! MPI stuff
    integer column_comm_rank
    integer column_processes
    integer row_comm_rank
    integer row_processes
    integer mpierr

    ! BLAS stuff
    integer blas_rows
    integer blas_columns
    n_cols = cols

    ! in column communicator
    call mpi_comm_rank(mpi_comm_rows, column_comm_rank, mpierr)
    call mpi_comm_size(mpi_comm_rows, column_processes, mpierr)

    ! in row communicator
    call mpi_comm_rank(mpi_comm_cols, row_comm_rank, mpierr)
    call mpi_comm_size(mpi_comm_cols, row_processes, mpierr)

    firstcol = istep*nbw + 1

    ! apply 2-blocked QR */
    do while (n_cols >= 2)

      ncol = istep*nbw + n_cols
      nrow = ncol - nbw

      if (nrow == 1) return ! nothing left to do
      ! figure out processes with "top" elements
      call mapEntrytoProcess(column_processes, 0, nrow-1, 0, nblk, top11_row, top11_column_comm_rank)
      call mapEntrytoProcess(row_processes,    0, ncol-1, 0, nblk, top11_col, top11_row_comm_rank)
      call mapEntrytoProcess(column_processes, 0, nrow-2, 0, nblk, top22_row, top22_column_comm_rank)
      call mapEntrytoProcess(row_processes,    0, ncol-2, 0, nblk, top22_col, top22_row_comm_rank)

      ! figure out size of sub matrix
      call matrixGridInfo(local_rows,    nrow,       nblk, column_processes, column_comm_rank)
      call matrixGridInfo(local_columns, ncol,       nblk, row_processes,    row_comm_rank)
      call matrixGridInfo(blas_columns,  firstcol-1, nblk, row_processes,    row_comm_rank)

      local_columns = local_columns - blas_columns
      if ((top22_row_comm_rank .ne. top11_row_comm_rank) .or. (nrow == 2)) then
        ! needed vectors are spread across two different processor columns,
        ! or meaningful data is only available for one vector
        ! => falling back to single householder vector generation

        if (top11_row_comm_rank == row_comm_rank) then
          ! calculate HV and tau
          tau1 = houseGen_reversed(a(1,top11_col), nrow, mpi_comm_rows, nblk, work)

          ! store tau in separate array
          work(1) = tau1
          work(2:local_rows+1) = a(1:local_rows, top11_col)

          ! update all following columns
          local_columns = local_columns - 1
        endif

        ! broadcast HV and tau to other process columns
        call mpi_bcast(work, local_rows+1, mpi_real8, top11_row_comm_rank, mpi_comm_cols, mpierr)

        tau1 = work(1)
        tmat(n_cols, n_cols, istep) = tau1
        vmr(1:local_rows, n_cols) = work(2:local_rows+1)

        if (top11_column_comm_rank == column_comm_rank) then
          vmr(top11_row, n_cols) = 1.0
        endif

        ! Apply HV to all other (local) columns
        if (local_columns > 0) then
          call houseLeft_reversed(vmr(1,n_cols), ldv, nrow, tau1, a(1,blas_columns+1), lda, local_columns, mpi_comm_rows, nblk, work)
        endif

        ! go to next column
        n_cols = n_cols - 1
        cycle
      endif
      call matrixGridInfo(blas_rows, nrow-1, nblk, column_processes, column_comm_rank) ! matrix without top11/top12 elements

      if (top11_row_comm_rank == row_comm_rank) then
        ! both vectors are available in current process column

        ! figure out size of sub matrix
        ! first collect needed data
        if (top11_column_comm_rank == column_comm_rank) then
          ! i got the top elements of the current column */
          top12 = a(top11_row, top22_col)
          top11 = a(top11_row, top22_col+1)
        else
          ! no special vector data
          top12 = 0.0
          top11 = 0.0
        endif

        if (top22_column_comm_rank == column_comm_rank) then
          top22 = a(top22_row, top22_col)
          top21 = a(top22_row, top22_col+1)
        else
          ! no special vector data
          top22 = 0.0
          top21 = 0.0
        endif

        if (blas_rows > 0) then
          ! only calculate if there is actual data,
          ! but still take part in Allreduce later on */
          dot_second_second = dot_product(a(1:blas_rows, top22_col),   a(1:blas_rows, top22_col))
          dot_first_first   = dot_product(a(1:blas_rows, top22_col+1), a(1:blas_rows, top22_col+1))
          dot_first_second  = dot_product(a(1:blas_rows, top22_col),   a(1:blas_rows, top22_col+1))
        else
          dot_second_second = 0.0
          dot_first_first   = 0.0
          dot_first_second  = 0.0
        endif
        ! prepare allreduce buffer
        work(1) = top11
        work(2) = top21
        work(3) = top12
        work(4) = top22
        work(5) = dot_first_first
        work(6) = dot_second_second
        work(7) = dot_first_second
        work(8) = 0.0 ! alignment

        ! step 1.5: allreduce data: dot_first_first, dot_second_second, dot_first_second, top11, top12, top21, top22, <null> (alignment)
        call mpi_allreduce(work(1), work(9), 8, mpi_real8, MPI_SUM, mpi_comm_rows, mpierr)

        ! get reduced values
        top11 = work(9)
        top21 = work(10)
        top12 = work(11)
        top22 = work(12)
        dot_first_first   = work(13)
        dot_second_second = work(14)
        dot_first_second  = work(15)
        ! step 2: build householder vectors (completely independent)

        ! step 2.1: build beta1
        beta1 = dot_first_first + top11*top11
        if (top11 < 0.0) then
          beta1 = -sqrt(beta1)
        else
          beta1 = sqrt(beta1)
        endif

        if (abs((dot_first_second + top11*top12)*(dot_first_second + top11*top12) / &
                ((dot_first_first + top11*top11)*(dot_second_second + top12*top12))) > (eps/(1.0d0 + eps))) then

          tau1 = top11 + beta1

          ! update columns and build householder vector
          if (top11_column_comm_rank == column_comm_rank) then
            a(top11_row, top22_col+1) = -beta1
          endif

          ! daxpy emulation, seems to be more efficient than actual daxpy call
          alpha = 1.0 / tau1
          a(1:blas_rows, top22_col+1) = a(1:blas_rows, top22_col+1) * alpha

          tau1 = tau1 / beta1

          ! apply houseleft operation to next column
          ! build y' * <second column> */
          if (top11_column_comm_rank == column_comm_rank) then
            dot_first_second = a(top11_row, top22_col)
          else
            dot_first_second = 0.0
          endif

          do row = 1, blas_rows
            dot_first_second = dot_first_second + a(row, top22_col) * a(row, top22_col+1)
          enddo

          work(1) = dot_first_second
          call mpi_allreduce(work(1), work(3), 1, mpi_real8, MPI_SUM, mpi_comm_rows, mpierr)
          dot_first_second = work(3)

          dot_first_second = dot_first_second * tau1

          ! update column and build dot_second_second on the fly */
          if (top11_column_comm_rank == column_comm_rank) then
            a(top11_row, top22_col) = a(top11_row, top22_col) - dot_first_second
          endif

          dot_second_second = 0.0
          do row = 1, blas_rows
            ! use top22 as temp variable
            top22 = a(row, top22_col) - a(row, top22_col+1) * dot_first_second
            dot_second_second = dot_second_second + top22*top22
            a(row, top22_col) = top22
          enddo

          ! exchange top22 and dot_second_second
          if (top22_column_comm_rank == column_comm_rank) then
            work(1) = a(top22_row, top22_col)
          else
            work(1) = 0.0
          endif
          work(2) = dot_second_second

          call mpi_allreduce(work(1), work(3), 2, mpi_real8, MPI_SUM, mpi_comm_rows, mpierr)

          top22 = work(3)
          dot_second_second = work(4)

          ! build second householder vector
          if (top22 < 0.0) then
            beta2 = -sqrt(dot_second_second)
          else
            beta2 = sqrt(dot_second_second)
          endif

          tau2 = beta2 + top22

          if (top22_column_comm_rank == column_comm_rank) then
            a(top22_row, top22_col) = -beta2
            blas_rows = blas_rows - 1
          endif

          alpha = 1.0 / tau2
          do row = 1, blas_rows
            a(row, top22_col) = a(row, top22_col) * alpha
          enddo

          tau2 = tau2 / beta2
        else
          ! step 2.2: build temporary tau1 and tau2 (version 1)
          tau1 = top11 + beta1
          z2 = dot_first_second / beta1 + top12 * (1 + top11 / beta1)
          z2_div_tau1 = z2 / tau1
          tau2 = top22 - z2_div_tau1 * top21

          ! step 2.3: build beta2
          beta2 = dot_second_second
          beta2 = beta2 - (z2_div_tau1) * (2.0 * dot_first_second - (z2_div_tau1) * (dot_first_first))

          if (tau2 < 0.0) then
            beta2 = -sqrt(beta2)
          else
            beta2 = sqrt(beta2)
          endif

          ! step 2.4: build temporary tau2 (version 2)
          tau2 = tau2 + beta2

          ! step 2.5: update columns and build householder vectors
          if (top11_column_comm_rank == column_comm_rank) then
            a(top11_row, top22_col+1) = -beta1
          endif

          ! daxpy emulation, seems to be more efficient than actual daxpy call
          alpha = 1.0 / tau1
          a(1:blas_rows, top22_col+1) = a(1:blas_rows, top22_col+1) * alpha

          ! step 2.6.1: build real tau1
          tau1 = tau1 / beta1

          if (top11_column_comm_rank == column_comm_rank) then
            a(top11_row, top22_col) = top12 - z2
          endif

          ! handle case with zero elements in second vector
          if (top22_column_comm_rank == column_comm_rank) then
            a(top22_row, top22_col) = -beta2
            blas_rows = blas_rows - 1
          endif

          ! perform a special "daxpy" which uses 2 scalars and 2 vectors
          alpha = 1.0 / tau2
          do row = 1, blas_rows
            a(row, top22_col) = (a(row, top22_col) - a(row, top22_col+1) * z2) * alpha
          enddo

          ! step 2.6.2: build real tau2, and add both tau values to their vector
          tau2 = tau2 / beta2
        endif

        ! fallback check
        work(1) = tau1
        work(2) = tau2
      endif
      ! END of householder vector generation */

      ! step 2.7: broadcast householder parts to other process columns
      if (local_rows > 0) then
        ! check if we are the column with the householder vectors,
        ! otherwise the processes copy their stuff just to lose it after broadcast */
        if (top11_row_comm_rank == row_comm_rank) then
          ! there is valuable householder vector data to broadcast
          work(3:local_rows+2)              = a(1:local_rows, top22_col)   ! second householder vector
          work(local_rows+3:2*local_rows+2) = a(1:local_rows, top22_col+1) ! first householder vector

          ! sanitize vectors by inserting zeros and ones
          ! (but only on the process with the top elements)
          if (top11_column_comm_rank == column_comm_rank) then
            work(2+top11_row)            = 0.0
            work(2+local_rows+top11_row) = 1.0
          endif
          if (top22_column_comm_rank == column_comm_rank) then
            work(2+top22_row) = 1.0
          endif
        endif
      endif

      call mpi_bcast(work, 2+2*local_rows, mpi_real8, top11_row_comm_rank, mpi_comm_cols, mpierr)

      tau1 = work(1)
      tau2 = work(2)

      if (local_rows > 0) then
        vmr(1:local_rows, n_cols-1) = work(3:local_rows+2)
        vmr(1:local_rows, n_cols)   = work(3+local_rows:2*local_rows+2)
      endif
      ! store tau values in future T matrix
      tmat(n_cols-1, n_cols-1, istep) = tau2
      tmat(n_cols,   n_cols,   istep) = tau1

      ! if we produced the householder vectors, move to next 2 columns
      if (top11_row_comm_rank == row_comm_rank) then
        local_columns = local_columns - 2
      endif

      ! time for updating the remaining columns, if there are any
      if (local_columns > 0) then
        ! fused approach
        call houseLeft_2rank_MPI_reversed(nrow, local_columns, nrow, a(1,blas_columns+1), lda, vmr(1,n_cols-1), ldv, work, mpi_comm_rows, nblk, work(3))

        ! normal approach
        !call houseLeft_reversed(vmr(1,n_cols), ldv, nrow, tau1, a(1,blas_columns+1), lda, local_columns, mpi_comm_rows, nblk, work(3))
        !call houseLeft_reversed(vmr(1,n_cols-1), ldv, nrow, tau2, a(1,blas_columns+1), lda, local_columns, mpi_comm_rows, nblk, work(3))
      endif

      ! advance to next two columns */
      n_cols = n_cols - 2
    end do
    ! handle last column the standard way
    if (n_cols .eq. 1) then
      ncol = istep*nbw + n_cols
      nrow = ncol - nbw

      if (nrow == 1) return ! nothing

      ! figure out size of sub matrix
      call matrixGridInfo(local_rows,    nrow, nblk, column_processes, column_comm_rank)
      call matrixGridInfo(local_columns, ncol, nblk, row_processes,    row_comm_rank)

      ! figure out process with "top" element
      call mapEntrytoProcess(column_processes, 0, nrow-1, 0, nblk, top11_row, top11_column_comm_rank)
      call mapEntrytoProcess(row_processes,    0, ncol-1, 0, nblk, top11_col, top11_row_comm_rank)

      if (top11_row_comm_rank == row_comm_rank) then
        ! calculate HV and tau
        tau1 = houseGen_reversed(a(1,top11_col), nrow, mpi_comm_rows, nblk, work)

        ! store tau in separate array
        work(1) = tau1
        work(2:local_rows+1) = a(1:local_rows, top11_col)
      endif

      ! broadcast HV and tau to other process columns
      call mpi_bcast(work, local_rows+1, mpi_real8, top11_row_comm_rank, mpi_comm_cols, mpierr)

      tau1 = work(1)
      tmat(n_cols, n_cols, istep) = tau1
      vmr(1:local_rows, n_cols) = work(2:local_rows+1)

      if (top11_column_comm_rank == column_comm_rank) then
        vmr(top11_row, n_cols) = 1.0
      endif
    endif
  end subroutine qr_rank2_real
  subroutine houseGen_scal(x, local_size, alpha)
    implicit none

    integer local_size
    real*8 alpha, x(*)

    call dscal(local_size, alpha, x, 1)
    !x(1:local_size) = x(1:local_size) * alpha
  end subroutine houseGen_scal
  real*8 function houseGen_dlapy2(x, y)
    implicit none

    real*8 x, y
    real*8 xabs
    real*8 yabs
    real*8 w
    real*8 z

    xabs = abs(x)
    yabs = abs(y)
    w = max(xabs, yabs)
    z = min(xabs, yabs)

    if (z == 0.0) housegen_dlapy2 = w

    z = z / w
    houseGen_dlapy2 = w * sqrt(1.0 + z*z)
  end function houseGen_dlapy2
  !
  ! Determines the Householder Vector which sets the
  ! @param work: buffer, needs to be 2x process_count of size
  ! @param x pointer to part of build vector
  ! @param n size of total build vector
  ! @param b index from which all entries in the vector should be zeroed
  ! @param communicator MPI communicator over which the vector is spread
  !
  function houseGen_reversed(x, b, communicator, blk, work)
    use mpi

    implicit none

    real*8 houseGen_reversed
    real*8 x(*), work(*)
    integer b, blk, communicator

    real*8 beta
    real*8 local_norm
    real*8 xnorm
    real*8 alpha
    integer alpha_index
    integer alpha_rank
    real*8 tau
    integer my_grid_rank
    integer process_count
    real*8 temp
    real*8 safmin
    real*8 rsafmn
    integer knt
    integer b_entry
    integer i
    integer local_size
    integer mpierr

    ! what rank am i and what data parts do i have?
    call mpi_comm_rank(communicator, my_grid_rank, mpierr)

    ! how many processes are on this communicator
    call mpi_comm_size(communicator, process_count, mpierr)

    ! max entries for this process
    call matrixGridInfo(local_size, b, blk, process_count, my_grid_rank)

    ! calculate norm
    local_norm = dnrm2(local_size, x, 1)

    alpha = 0.0

    ! determine alpha value (= first value in global vector part)
    call mapEntrytoProcess(process_count, 0, b-1, 0, blk, alpha_index, alpha_rank)
    if (my_grid_rank == alpha_rank) alpha = x(alpha_index)

    ! work buffer: first receive_buffer, then send_buffer
    do i = 1, process_count
      work(2*i-1) = local_norm
      work(2*i)   = alpha
    enddo

    ! now exchange all local norms to all processes
    call mpi_alltoall(work, 2, mpi_real8, work(2*process_count+1), 2, mpi_real8, communicator, mpierr)

    ! process with alpha rank provides alpha value
    alpha = work(2*process_count+1+2*alpha_rank+1)

    ! copy local norms into right position
    do i = 1, process_count
      work(i) = work(2*process_count+2*i-1)
    enddo

    ! now calculate global norm
    xnorm = dnrm2(process_count, work, 1)

    ! use safe minimum as 0 indicator
    !safmin = dlamch('S')

    ! General case
    beta = xnorm
    if (alpha < 0.0) beta = -beta

    tau = (beta + alpha) / beta
    call houseGen_scal(x, local_size, 1.0/(alpha+beta))
    alpha = -beta

    if (alpha_rank == my_grid_rank) x(alpha_index) = alpha

    houseGen_reversed = tau
  end function houseGen_reversed
  ! maps array slots of one process and old block size to a new process with new block size
  subroutine mapEntrytoProcess(processes, nb_old, entry_old, rank_old, nb_new, entry_new, rank_new)
    implicit none

    integer processes, nb_old, entry_old, rank_old, nb_new, entry_new, rank_new
    integer real_entry

    if ((nb_old == 0) .and. (nb_new == 0)) then
      entry_new = entry_old
      rank_new = 0
    endif

    if (nb_old == 0) then
      real_entry = entry_old
    else
      real_entry = rank_old*nb_old + (entry_old/nb_old)*(nb_old*processes) + modulo(entry_old, nb_old)
    endif

    if (nb_new == 0) then
      rank_new = 0