Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
03d44007
Commit
03d44007
authored
Jul 27, 2020
by
Andreas Marek
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Some intendation
parent
16b9fbf6
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
407 additions
and
406 deletions
+407
-406
src/elpa2/elpa2_trans_ev_band_to_full_template.F90
src/elpa2/elpa2_trans_ev_band_to_full_template.F90
+407
-406
No files found.
src/elpa2/elpa2_trans_ev_band_to_full_template.F90
View file @
03d44007
...
...
@@ -51,7 +51,7 @@
#include "../general/sanity.F90"
subroutine
trans_ev_band_to_full_
&
subroutine
trans_ev_band_to_full_
&
&
MATH_DATATYPE
&
&
_
&
&
PRECISION
&
...
...
@@ -64,486 +64,487 @@
)
#endif
!-------------------------------------------------------------------------------
! trans_ev_band_to_full_real/complex:
! Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix
!
! Parameters
!
! na Order of matrix a_mat, number of rows of matrix q_mat
!
! nqc Number of columns of matrix q_mat
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwith
!
! a_mat(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a_mat after bandred_real/complex)
! Distribution is like in Scalapack.
!
! lda Leading dimension of a_mat
! matrixCols local columns of matrix a_mat and q_mat
!
! tmat(nbw,nbw,numBlocks) Factors returned by bandred_real/complex
!
! q_mat On input: Eigenvectors of band matrix
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! ldq Leading dimension of q_mat
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
use
precision
use
cuda_functions
use
iso_c_binding
use
elpa_abstract_impl
use
elpa_blas_interfaces
implicit
none
!-------------------------------------------------------------------------------
! trans_ev_band_to_full_real/complex:
! Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix
!
! Parameters
!
! na Order of matrix a_mat, number of rows of matrix q_mat
!
! nqc Number of columns of matrix q_mat
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwith
!
! a_mat(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a_mat after bandred_real/complex)
! Distribution is like in Scalapack.
!
! lda Leading dimension of a_mat
! matrixCols local columns of matrix a_mat and q_mat
!
! tmat(nbw,nbw,numBlocks) Factors returned by bandred_real/complex
!
! q_mat On input: Eigenvectors of band matrix
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! ldq Leading dimension of q_mat
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
use
precision
use
cuda_functions
use
iso_c_binding
use
elpa_abstract_impl
use
elpa_blas_interfaces
implicit
none
#include "../general/precision_kinds.F90"
class
(
elpa_abstract_impl_t
),
intent
(
inout
)
::
obj
logical
,
intent
(
in
)
::
useGPU
class
(
elpa_abstract_impl_t
),
intent
(
inout
)
::
obj
logical
,
intent
(
in
)
::
useGPU
#if REALCASE == 1
logical
,
intent
(
in
)
::
useQR
logical
,
intent
(
in
)
::
useQR
#endif
integer
(
kind
=
ik
)
::
na
,
nqc
,
lda
,
ldq
,
nblk
,
nbw
,
matrixCols
,
numBlocks
,
mpi_comm_rows
,
mpi_comm_cols
integer
(
kind
=
ik
)
::
na
,
nqc
,
lda
,
ldq
,
nblk
,
nbw
,
matrixCols
,
numBlocks
,
mpi_comm_rows
,
mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE
(
kind
=
rck
)
::
a_mat
(
lda
,
*
)
MATH_DATATYPE
(
kind
=
rck
)
::
q_mat
(
ldq
,
*
),
tmat
(
nbw
,
nbw
,
*
)
MATH_DATATYPE
(
kind
=
rck
)
::
a_mat
(
lda
,
*
)
MATH_DATATYPE
(
kind
=
rck
)
::
q_mat
(
ldq
,
*
),
tmat
(
nbw
,
nbw
,
*
)
#else
MATH_DATATYPE
(
kind
=
rck
)
::
a_mat
(
lda
,
matrixCols
)
MATH_DATATYPE
(
kind
=
rck
)
::
q_mat
(
ldq
,
matrixCols
),
tmat
(
nbw
,
nbw
,
numBlocks
)
MATH_DATATYPE
(
kind
=
rck
)
::
a_mat
(
lda
,
matrixCols
)
MATH_DATATYPE
(
kind
=
rck
)
::
q_mat
(
ldq
,
matrixCols
),
tmat
(
nbw
,
nbw
,
numBlocks
)
#endif
integer
(
kind
=
ik
)
::
my_prow
,
my_pcol
,
np_rows
,
np_cols
integer
(
kind
=
MPI_KIND
)
::
my_prowMPI
,
my_pcolMPI
,
np_rowsMPI
,
np_colsMPI
,
mpierr
integer
(
kind
=
ik
)
::
max_blocks_row
,
max_blocks_col
,
max_local_rows
,
&
max_local_cols
integer
(
kind
=
ik
)
::
l_cols
,
l_rows
,
l_colh
,
n_cols
integer
(
kind
=
ik
)
::
istep
,
lc
,
ncol
,
nrow
,
nb
,
ns
MATH_DATATYPE
(
kind
=
rck
),
allocatable
::
hvb
(:)
MATH_DATATYPE
(
kind
=
rck
),
pointer
::
hvm
(:,:),
tmp1
(:),
tmp2
(:)
! hvm_dev is fist used and set in this routine
! q_mat is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted
! tmp_dev is first used in this routine
! tmat_dev is not passed along from bandred_real
integer
(
kind
=
C_intptr_T
)
::
hvm_dev
,
q_dev
,
tmp_dev
,
tmat_dev
type
(
c_ptr
)
::
hvm_host
,
tmp1_host
,
tmp2_host
integer
(
kind
=
ik
)
::
i
MATH_DATATYPE
(
kind
=
rck
),
allocatable
::
tmat_complete
(:,:),
t_tmp
(:,:),
t_tmp2
(:,:)
integer
(
kind
=
ik
)
::
t_cols
,
t_rows
integer
(
kind
=
ik
)
::
cwy_blocking
integer
(
kind
=
ik
)
::
istat
character
(
200
)
::
errorMessage
character
(
20
)
::
gpuString
logical
::
successCUDA
integer
(
kind
=
c_intptr_t
),
parameter
::
size_of_datatype
=
size_of_
&
&
PRECISION
&
&
_
&
&
MATH_DATATYPE
integer
(
kind
=
ik
)
::
blocking_factor
,
error
if
(
useGPU
)
then
gpuString
=
"_gpu"
else
gpuString
=
""
endif
call
obj
%
timer
%
start
(
"trans_ev_band_to_full_&
&MATH_DATATYPE&
&"
//
&
&
PRECISION_SUFFIX
//&
gpuString
)
integer
(
kind
=
ik
)
::
my_prow
,
my_pcol
,
np_rows
,
np_cols
integer
(
kind
=
MPI_KIND
)
::
my_prowMPI
,
my_pcolMPI
,
np_rowsMPI
,
np_colsMPI
,
mpierr
integer
(
kind
=
ik
)
::
max_blocks_row
,
max_blocks_col
,
max_local_rows
,
&
max_local_cols
integer
(
kind
=
ik
)
::
l_cols
,
l_rows
,
l_colh
,
n_cols
integer
(
kind
=
ik
)
::
istep
,
lc
,
ncol
,
nrow
,
nb
,
ns
MATH_DATATYPE
(
kind
=
rck
),
allocatable
::
hvb
(:)
MATH_DATATYPE
(
kind
=
rck
),
pointer
::
hvm
(:,:),
tmp1
(:),
tmp2
(:)
! hvm_dev is fist used and set in this routine
! q_mat is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted
! tmp_dev is first used in this routine
! tmat_dev is not passed along from bandred_real
integer
(
kind
=
C_intptr_T
)
::
hvm_dev
,
q_dev
,
tmp_dev
,
tmat_dev
type
(
c_ptr
)
::
hvm_host
,
tmp1_host
,
tmp2_host
integer
(
kind
=
ik
)
::
i
MATH_DATATYPE
(
kind
=
rck
),
allocatable
::
tmat_complete
(:,:),
t_tmp
(:,:),
t_tmp2
(:,:)
integer
(
kind
=
ik
)
::
t_cols
,
t_rows
integer
(
kind
=
ik
)
::
cwy_blocking
integer
(
kind
=
ik
)
::
istat
character
(
200
)
::
errorMessage
character
(
20
)
::
gpuString
logical
::
successCUDA
integer
(
kind
=
c_intptr_t
),
parameter
::
size_of_datatype
=
size_of_
&
&
PRECISION
&
&
_
&
&
MATH_DATATYPE
integer
(
kind
=
ik
)
::
blocking_factor
,
error
,
blk_end
if
(
useGPU
)
then
gpuString
=
"_gpu"
else
gpuString
=
""
endif
call
obj
%
timer
%
start
(
"trans_ev_band_to_full_&
&MATH_DATATYPE&
&"
//
&
&
PRECISION_SUFFIX
//&
gpuString
)
#ifdef BAND_TO_FULL_BLOCKING
call
obj
%
get
(
"blocking_in_band_to_full"
,
blocking_factor
,
error
)
if
(
error
.ne.
ELPA_OK
)
then
print
*
,
"Problem getting option for blocking_in_band_to_full. Aborting..."
stop
endif
call
obj
%
get
(
"blocking_in_band_to_full"
,
blocking_factor
,
error
)
if
(
error
.ne.
ELPA_OK
)
then
print
*
,
"Problem getting option for blocking_in_band_to_full. Aborting..."
stop
endif
#else
blocking_factor
=
1
blocking_factor
=
1
#endif
call
obj
%
timer
%
start
(
"mpi_communication"
)
call
mpi_comm_rank
(
int
(
mpi_comm_rows
,
kind
=
MPI_KIND
)
,
my_prowMPI
,
mpierr
)
call
mpi_comm_size
(
int
(
mpi_comm_rows
,
kind
=
MPI_KIND
)
,
np_rowsMPI
,
mpierr
)
call
mpi_comm_rank
(
int
(
mpi_comm_cols
,
kind
=
MPI_KIND
)
,
my_pcolMPI
,
mpierr
)
call
mpi_comm_size
(
int
(
mpi_comm_cols
,
kind
=
MPI_KIND
)
,
np_colsMPI
,
mpierr
)
my_prow
=
int
(
my_prowMPI
,
kind
=
c_int
)
my_pcol
=
int
(
my_pcolMPI
,
kind
=
c_int
)
np_rows
=
int
(
np_rowsMPI
,
kind
=
c_int
)
np_cols
=
int
(
np_colsMPI
,
kind
=
c_int
)
call
obj
%
timer
%
stop
(
"mpi_communication"
)
max_blocks_row
=
((
na
-1
)/
nblk
)/
np_rows
+
1
! Rows of a_mat
max_blocks_col
=
((
nqc
-1
)/
nblk
)/
np_cols
+
1
! Columns of q_mat!
max_local_rows
=
max_blocks_row
*
nblk
max_local_cols
=
max_blocks_col
*
nblk
cwy_blocking
=
blocking_factor
*
nbw
if
(
useGPU
)
then
! copy q_mat to q_dev
successCUDA
=
cuda_malloc
(
q_dev
,
ldq
*
matrixCols
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev_band_to_full: q_dev"
,
successCUDA
)
successCUDA
=
cuda_host_register
(
int
(
loc
(
q_mat
),
kind
=
c_intptr_t
),&
ldq
*
matrixCols
*
size_of_datatype
,
cudaHostRegisterDefault
)
check_host_register_cuda
(
"trans_ev_band_to_full: q_mat"
,
successCUDA
)
successCUDA
=
cuda_memcpy
(
q_dev
,
int
(
loc
(
q_mat
),
kind
=
c_intptr_t
),&
ldq
*
matrixCols
*
size_of_datatype
,
cudaMemcpyHostToDevice
)
check_memcpy_cuda
(
"trans_ev_band_to_full: q_mat -> q_dev"
,
successCUDA
)
successCUDA
=
cuda_malloc_host
(
tmp1_host
,
max_local_cols
*
cwy_blocking
*
size_of_datatype
)
check_host_alloc_cuda
(
"trans_ev_band_to_full: tmp1_host"
,
successCUDA
)
call
c_f_pointer
(
tmp1_host
,
tmp1
,
(/
max_local_cols
*
cwy_blocking
/))
successCUDA
=
cuda_malloc_host
(
tmp2_host
,
max_local_cols
*
cwy_blocking
*
size_of_datatype
)
check_host_alloc_cuda
(
"trans_ev_band_to_full: tmp2_host"
,
successCUDA
)
call
c_f_pointer
(
tmp2_host
,
tmp2
,
(/
max_local_cols
*
cwy_blocking
/))
successCUDA
=
cuda_malloc_host
(
hvm_host
,
max_local_rows
*
cwy_blocking
*
size_of_datatype
)
check_host_alloc_cuda
(
"trans_ev_band_to_full: hvm_host"
,
successCUDA
)
call
c_f_pointer
(
hvm_host
,
hvm
,
(/
max_local_rows
,
cwy_blocking
/))
else
! useGPU
allocate
(
tmp1
(
max_local_cols
*
cwy_blocking
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: tmp1"
,
istat
,
errorMessage
)
allocate
(
tmp2
(
max_local_cols
*
cwy_blocking
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: tmp2"
,
istat
,
errorMessage
)
call
obj
%
timer
%
start
(
"mpi_communication"
)
call
mpi_comm_rank
(
int
(
mpi_comm_rows
,
kind
=
MPI_KIND
)
,
my_prowMPI
,
mpierr
)
call
mpi_comm_size
(
int
(
mpi_comm_rows
,
kind
=
MPI_KIND
)
,
np_rowsMPI
,
mpierr
)
call
mpi_comm_rank
(
int
(
mpi_comm_cols
,
kind
=
MPI_KIND
)
,
my_pcolMPI
,
mpierr
)
call
mpi_comm_size
(
int
(
mpi_comm_cols
,
kind
=
MPI_KIND
)
,
np_colsMPI
,
mpierr
)
my_prow
=
int
(
my_prowMPI
,
kind
=
c_int
)
my_pcol
=
int
(
my_pcolMPI
,
kind
=
c_int
)
np_rows
=
int
(
np_rowsMPI
,
kind
=
c_int
)
np_cols
=
int
(
np_colsMPI
,
kind
=
c_int
)
call
obj
%
timer
%
stop
(
"mpi_communication"
)
max_blocks_row
=
((
na
-1
)/
nblk
)/
np_rows
+
1
! Rows of a_mat
max_blocks_col
=
((
nqc
-1
)/
nblk
)/
np_cols
+
1
! Columns of q_mat!
max_local_rows
=
max_blocks_row
*
nblk
max_local_cols
=
max_blocks_col
*
nblk
cwy_blocking
=
blocking_factor
*
nbw
if
(
useGPU
)
then
! copy q_mat to q_dev
successCUDA
=
cuda_malloc
(
q_dev
,
ldq
*
matrixCols
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev_band_to_full: q_dev"
,
successCUDA
)
successCUDA
=
cuda_host_register
(
int
(
loc
(
q_mat
),
kind
=
c_intptr_t
),&
ldq
*
matrixCols
*
size_of_datatype
,
cudaHostRegisterDefault
)
check_host_register_cuda
(
"trans_ev_band_to_full: q_mat"
,
successCUDA
)
allocate
(
hvm
(
max_local_rows
,
cwy_blocking
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: hvm"
,
istat
,
errorMessag
e
)
endif
!useGPU
successCUDA
=
cuda_memcpy
(
q_dev
,
int
(
loc
(
q_mat
),
kind
=
c_intptr_t
),&
ldq
*
matrixCols
*
size_of_datatype
,
cudaMemcpyHostToDevic
e
)
check_memcpy_cuda
(
"trans_ev_band_to_full: q_mat -> q_dev"
,
successCUDA
)
allocate
(
hvb
(
max_local_rows
*
cwy_blocking
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: hvb"
,
istat
,
errorMessage
)
successCUDA
=
cuda_malloc_host
(
tmp1_host
,
max_local_cols
*
cwy_blocking
*
size_of_datatype
)
check_host_alloc_cuda
(
"trans_ev_band_to_full: tmp1_host"
,
successCUDA
)
call
c_f_pointer
(
tmp1_host
,
tmp1
,
(/
max_local_cols
*
cwy_blocking
/))
allocate
(
tmat_complete
(
cwy_blocking
,
cwy_blocking
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: tmat_complete"
,
istat
,
errorMessage
)
successCUDA
=
cuda_malloc_host
(
tmp2_host
,
max_local_cols
*
cwy_blocking
*
size_of_datatype
)
check_host_alloc_cuda
(
"trans_ev_band_to_full: tmp2_host"
,
successCUDA
)
call
c_f_pointer
(
tmp2_host
,
tmp2
,
(/
max_local_cols
*
cwy_blocking
/))
if
(
useGPU
)
then
successCUDA
=
cuda_host_register
(
int
(
loc
(
tmat_complete
),
kind
=
c_intptr_t
),
&
cwy_blocking
*
cwy_blocking
*
size_of_datatype
,&
cudaHostRegisterDefault
)
check_host_register_cuda
(
"trans_ev_band_to_full: tmat_complete"
,
successCUDA
)
endif
successCUDA
=
cuda_malloc_host
(
hvm_host
,
max_local_rows
*
cwy_blocking
*
size_of_datatype
)
check_host_alloc_cuda
(
"trans_ev_band_to_full: hvm_host"
,
successCUDA
)
call
c_f_pointer
(
hvm_host
,
hvm
,
(/
max_local_rows
,
cwy_blocking
/))
if
(
blocking_factor
>
1
)
then
allocate
(
t_tmp
(
cwy_blocking
,
nbw
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: t_tmp"
,
istat
,
errorMessage
)
allocate
(
t_tmp2
(
cwy_blocking
,
nbw
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: t_tmp2"
,
istat
,
errorMessage
)
endif
else
! useGPU
allocate
(
tmp1
(
max_local_cols
*
cwy_blocking
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: tmp1"
,
istat
,
errorMessage
)
if
(
useGPU
)
then
successCUDA
=
cuda_malloc
(
hvm_dev
,
max_local_rows
*
cwy_blocking
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev_band_to_full: hvm_dev"
,
successCUDA
)
allocate
(
tmp2
(
max_local_cols
*
cwy_blocking
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: tmp2"
,
istat
,
errorMessage
)
successCUDA
=
cuda_malloc
(
tmp_dev
,
max_local_cols
*
cwy_blocking
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev_band_to_full: tmp_dev"
,
successCUDA
)
allocate
(
hvm
(
max_local_rows
,
cwy_blocking
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: hvm"
,
istat
,
errorMessage
)
endif
!useGPU
successCUDA
=
cuda_malloc
(
tmat_dev
,
cwy_blocking
*
cwy_blocking
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev_band_to_full: tmat_dev"
,
successCUDA
)
endif
allocate
(
hvb
(
max_local_rows
*
cwy_blocking
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: hvb"
,
istat
,
errorMessage
)
hvm
=
0.0_rck
! Must be set to 0 !!!
hvb
=
0.0_rck
! Safety only
tmp1
=
0.0_rck
tmp2
=
0.0_rck
tmat_complete
=
0.0_rck
if
(
blocking_factor
>
1
)
then
t_tmp
=
0.0_rck
! Must be set to 0 !!!
t_tmp2
=
0.0_rck
endif
l_cols
=
local_index
(
nqc
,
my_pcol
,
np_cols
,
nblk
,
-1
)
! Local columns of q_mat
allocate
(
tmat_complete
(
cwy_blocking
,
cwy_blocking
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: tmat_complete"
,
istat
,
errorMessage
)
do
istep
=
1
,((
na
-1
)/
nbw
-1
)/
blocking_factor
+
1
if
(
useGPU
)
then
successCUDA
=
cuda_host_register
(
int
(
loc
(
tmat_complete
),
kind
=
c_intptr_t
),
&
cwy_blocking
*
cwy_blocking
*
size_of_datatype
,&
cudaHostRegisterDefault
)
check_host_register_cuda
(
"trans_ev_band_to_full: tmat_complete"
,
successCUDA
)
endif
! This the call when using na >= ((blocking_factor+1)*nbw)
! n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw
! Number of columns in current step
! As an alternative we add some special case handling if na < cwy_blocking
if
(
na
<
cwy_blocking
)
then
n_cols
=
MAX
(
0
,
na
-
nbw
)
if
(
n_cols
.eq.
0
)
then
exit
end
if
else
n_cols
=
MIN
(
na
,
istep
*
cwy_blocking
+
nbw
)
-
(
istep
-1
)
*
cwy_blocking
-
nbw
! Number of columns in current step
end
if
if
(
blocking_factor
>
1
)
then
allocate
(
t_tmp
(
cwy_blocking
,
nbw
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: t_tmp"
,
istat
,
errorMessage
)
! Broadcast all Householder vectors for current step compressed in hvb
allocate
(
t_tmp2
(
cwy_blocking
,
nbw
),
stat
=
istat
,
errmsg
=
errorMessage
)
check_allocate
(
"trans_ev_band_to_full: t_tmp2"
,
istat
,
errorMessage
)
endif
nb
=
0
ns
=
0
if
(
useGPU
)
then
successCUDA
=
cuda_malloc
(
hvm_dev
,
max_local_rows
*
cwy_blocking
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev_band_to_full: hvm_dev"
,
successCUDA
)
do
lc
=
1
,
n_cols
ncol
=
(
istep
-1
)
*
cwy_blocking
+
nbw
+
lc
! absolute column number of householder Vector
nrow
=
ncol
-
nbw
! absolute number of pivot row
successCUDA
=
cuda_malloc
(
tmp_dev
,
max_local_cols
*
cwy_blocking
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev_band_to_full: tmp_dev"
,
successCUDA
)
l_rows
=
local_index
(
nrow
-1
,
my_prow
,
np_rows
,
nblk
,
-1
)
! row length for bcast
l_colh
=
local_index
(
ncol
,
my_pcol
,
np_cols
,
nblk
,
-1
)
! HV local column number
successCUDA
=
cuda_malloc
(
tmat_dev
,
cwy_blocking
*
cwy_blocking
*
size_of_datatype
)
check_alloc_cuda
(
"trans_ev_band_to_full: tmat_dev"
,
successCUDA
)
endif
if
(
my_pcol
==
pcol
(
ncol
,
nblk
,
np_cols
))
hvb
(
nb
+1
:
nb
+
l_rows
)
=
a_mat
(
1
:
l_rows
,
l_colh
)
hvm
=
0.0_rck
! Must be set to 0 !!!
hvb
=
0.0_rck
! Safety only
tmp1
=
0.0_rck
tmp2
=
0.0_rck
tmat_complete
=
0.0_rck
if
(
blocking_factor
>
1
)
then
t_tmp
=
0.0_rck
! Must be set to 0 !!!
t_tmp2
=
0.0_rck
endif
l_cols
=
local_index
(
nqc
,
my_pcol
,
np_cols
,
nblk
,
-1
)
! Local columns of q_mat
nb
=
nb
+
l_rows
blk_end
=
((
na
-1
)/
nbw
-1
)/
blocking_factor
+
1
do
istep
=
1
,
blk_end
! This the call when using na >= ((blocking_factor+1)*nbw)
! n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw
! Number of columns in current step
! As an alternative we add some special case handling if na < cwy_blocking
if
(
na
<
cwy_blocking
)
then
n_cols
=
MAX
(
0
,
na
-
nbw
)
if
(
n_cols
.eq.
0
)
then
exit
end
if
else
n_cols
=
MIN
(
na
,
istep
*
cwy_blocking
+
nbw
)
-
(
istep
-1
)
*
cwy_blocking
-
nbw
! Number of columns in current step
end
if
if
(
lc
==
n_cols
.or.
mod
(
ncol
,
nblk
)
==
0
)
then
! Broadcast all Householder vectors for current step compressed in hvb
nb
=
0
ns
=
0
do
lc
=
1
,
n_cols
ncol
=
(
istep
-1
)
*
cwy_blocking
+
nbw
+
lc
! absolute column number of householder Vector
nrow
=
ncol
-
nbw
! absolute number of pivot row
l_rows
=
local_index
(
nrow
-1
,
my_prow
,
np_rows
,
nblk
,
-1
)
! row length for bcast
l_colh
=
local_index
(
ncol
,
my_pcol
,
np_cols
,
nblk
,
-1
)
! HV local column number
if
(
my_pcol
==
pcol
(
ncol
,
nblk
,
np_cols
))
hvb
(
nb
+1
:
nb
+
l_rows
)
=
a_mat
(
1
:
l_rows
,
l_colh
)
nb
=
nb
+
l_rows
if
(
lc
==
n_cols
.or.
mod
(
ncol
,
nblk
)
==
0
)
then
#ifdef WITH_MPI
call
obj
%
timer
%
start
(
"mpi_communication"
)
call
MPI_Bcast
(
hvb
(
ns
+1
),
int
(
nb
-
ns
,
kind
=
MPI_KIND
),
MPI_MATH_DATATYPE_PRECISION
,&
int
(
pcol
(
ncol
,
nblk
,
np_cols
),
kind
=
MPI_KIND
),
int
(
mpi_comm_cols
,
kind
=
MPI_KIND
),
mpierr
)
call
obj
%
timer
%
start
(
"mpi_communication"
)
call
MPI_Bcast
(
hvb
(
ns
+1
),
int
(
nb
-
ns
,
kind
=
MPI_KIND
),
MPI_MATH_DATATYPE_PRECISION
,&
int
(
pcol
(
ncol
,
nblk
,
np_cols
),
kind
=
MPI_KIND
),
int
(
mpi_comm_cols
,
kind
=
MPI_KIND
),
mpierr
)
call
obj
%
timer
%
stop
(
"mpi_communication"
)
call
obj
%
timer
%
stop
(
"mpi_communication"
)
#endif /* WITH_MPI */
ns
=
nb
endif
enddo
! lc
! Expand compressed Householder vectors into matrix hvm
nb
=
0
do
lc
=
1
,
n_cols
nrow
=
(
istep
-1
)
*
cwy_blocking
+
lc
! absolute number of pivot row
l_rows
=
local_index
(
nrow
-1
,
my_prow
,
np_rows
,
nblk
,
-1
)
! row length for bcast
hvm
(
1
:
l_rows
,
lc
)
=
hvb
(
nb
+1
:
nb
+
l_rows
)
if
(
my_prow
==
prow
(
nrow
,
nblk
,
np_rows
))
hvm
(
l_rows
+1
,
lc
)
=
1.0_rck
nb
=
nb
+
l_rows
enddo
l_rows
=
local_index
(
MIN
(
na
,(
istep
+1
)
*
cwy_blocking
),
my_prow
,
np_rows
,
nblk
,
-1
)
! compute tmat2 out of tmat(:,:,)
tmat_complete
=
0
do
i
=
1
,
blocking_factor
t_cols
=
MIN
(
nbw
,
n_cols
-
(
i
-1
)
*
nbw
)
if
(
t_cols
<=
0
)
exit
t_rows
=
(
i
-
1
)
*
nbw
tmat_complete
(
t_rows
+1
:
t_rows
+
t_cols
,
t_rows
+1
:
t_rows
+
t_cols
)
=
tmat
(
1
:
t_cols
,
1
:
t_cols
,(
istep
-1
)
*
blocking_factor
+
i
)
if
(
i
>
1
)
then
call
obj
%
timer
%
start
(
"blas"
)
call
PRECISION_GEMM
(
BLAS_TRANS_OR_CONJ
,
'N'
,
&
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
int
(
l_rows
,
kind
=
BLAS_KIND
),
ONE
,
hvm
,
&
int
(
max_local_rows
,
kind
=
BLAS_KIND
),
hvm
(:,(
i
-1
)
*
nbw
+1
:),
&
int
(
max_local_rows
,
kind
=
BLAS_KIND
),
ZERO
,
t_tmp
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
call
obj
%
timer
%
stop
(
"blas"
)
ns
=
nb
endif
enddo
! lc
! Expand compressed Householder vectors into matrix hvm
nb
=
0
do
lc
=
1
,
n_cols
nrow
=
(
istep
-1
)
*
cwy_blocking
+
lc
! absolute number of pivot row
l_rows
=
local_index
(
nrow
-1
,
my_prow
,
np_rows
,
nblk
,
-1
)
! row length for bcast
hvm
(
1
:
l_rows
,
lc
)
=
hvb
(
nb
+1
:
nb
+
l_rows
)
if
(
my_prow
==
prow
(
nrow
,
nblk
,
np_rows
))
hvm
(
l_rows
+1
,
lc
)
=
1.0_rck
nb
=
nb
+
l_rows
enddo
l_rows
=
local_index
(
MIN
(
na
,(
istep
+1
)
*
cwy_blocking
),
my_prow
,
np_rows
,
nblk
,
-1
)
! compute tmat2 out of tmat(:,:,)
tmat_complete
=
0
do
i
=
1
,
blocking_factor
t_cols
=
MIN
(
nbw
,
n_cols
-
(
i
-1
)
*
nbw
)
if
(
t_cols
<=
0
)
exit
t_rows
=
(
i
-
1
)
*
nbw
tmat_complete
(
t_rows
+1
:
t_rows
+
t_cols
,
t_rows
+1
:
t_rows
+
t_cols
)
=
tmat
(
1
:
t_cols
,
1
:
t_cols
,(
istep
-1
)
*
blocking_factor
+
i
)
if
(
i
>
1
)
then
call
obj
%
timer
%
start
(
"blas"
)
call
PRECISION_GEMM
(
BLAS_TRANS_OR_CONJ
,
'N'
,
&
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
int
(
l_rows
,
kind
=
BLAS_KIND
),
ONE
,
hvm
,
&
int
(
max_local_rows
,
kind
=
BLAS_KIND
),
hvm
(:,(
i
-1
)
*
nbw
+1
:),
&
int
(
max_local_rows
,
kind
=
BLAS_KIND
),
ZERO
,
t_tmp
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
call
obj
%
timer
%
stop
(
"blas"
)
#ifdef WITH_MPI
call
obj
%
timer
%
start
(
"mpi_communication"
)
call
mpi_allreduce
(
t_tmp
,
t_tmp2
,
int
(
cwy_blocking
*
nbw
,
kind
=
MPI_KIND
),
MPI_MATH_DATATYPE_PRECISION
,
&
MPI_SUM
,
int
(
mpi_comm_rows
,
kind
=
MPI_KIND
),
mpierr
)
call
obj
%
timer
%
stop
(
"mpi_communication"
)
call
obj
%
timer
%
start
(
"mpi_communication"
)
call
mpi_allreduce
(
t_tmp
,
t_tmp2
,
int
(
cwy_blocking
*
nbw
,
kind
=
MPI_KIND
),
MPI_MATH_DATATYPE_PRECISION
,
&
MPI_SUM
,
int
(
mpi_comm_rows
,
kind
=
MPI_KIND
),
mpierr
)
call
obj
%
timer
%
stop
(
"mpi_communication"
)
call
obj
%
timer
%
start
(
"blas"
)
call
PRECISION_TRMM
(
'L'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp2
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
call
PRECISION_TRMM
(
'R'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
-
ONE
,
&
tmat_complete
(
t_rows
+1
,
t_rows
+1
),
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp2
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
call
obj
%
timer
%
stop
(
"blas"
)
call
obj
%
timer
%
start
(
"blas"
)
call
PRECISION_TRMM
(
'L'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp2
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
call
PRECISION_TRMM
(
'R'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
-
ONE
,
&
tmat_complete
(
t_rows
+1
,
t_rows
+1
),
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp2
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
call
obj
%
timer
%
stop
(
"blas"
)
tmat_complete
(
1
:
t_rows
,
t_rows
+1
:
t_rows
+
t_cols
)
=
t_tmp2
(
1
:
t_rows
,
1
:
t_cols
)
tmat_complete
(
1
:
t_rows
,
t_rows
+1
:
t_rows
+
t_cols
)
=
t_tmp2
(
1
:
t_rows
,
1
:
t_cols
)
#else /* WITH_MPI */
call
obj
%
timer
%
start
(
"blas"
)
call
PRECISION_TRMM
(
'L'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&