Commit 65d959e7 authored Nov 29, 2016 by Andreas Marek
Split elpa2_compute_{real|complex}_template in several files
parent e49dd8bf
Changes: 16    Pipelines: 1
Showing 16 changed files with 11460 additions and 11215 deletions.
Makefile.am  +11  -1
src/elpa2_bandred_complex_template.X90  +1169  -0
src/elpa2_bandred_real_template.X90  +1215  -0
src/elpa2_compute.F90  +1  -1
src/elpa2_compute_complex_template.X90  +5  -6080
src/elpa2_compute_real_template.X90  +7  -5133
src/elpa2_determine_workload.F90  +157  -0
src/elpa2_herm_matrix_allreduce_complex_template.X90  +92  -0
src/elpa2_symm_matrix_allreduce_real_template.X90  +71  -0
src/elpa2_trans_ev_band_to_full_complex_template.X90  +507  -0
src/elpa2_trans_ev_band_to_full_real_template.X90  +585  -0
src/elpa2_trans_ev_tridi_to_band_complex_template.X90  +2954  -0
src/elpa2_trans_ev_tridi_to_band_real_template.X90  +2232  -0
src/elpa2_tridiag_band_complex_template.X90  +1434  -0
src/elpa2_tridiag_band_real_template.X90  +1019  -0
src/redist_band.X90  +1  -0
Makefile.am
...
...
@@ -32,6 +32,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
  src/mod_pack_unpack_complex.F90 \
  src/aligned_mem.F90 \
  src/elpa1_compute_private.F90 \
  src/elpa2_determine_workload.F90 \
  src/elpa2_compute.F90 \
  src/elpa2_kernels/mod_fortran_interfaces.F90 \
  src/elpa2_kernels/mod_single_hh_trafo_real.F90 \
...
...
@@ -48,10 +49,19 @@ libelpa@SUFFIX@_private_la_SOURCES = \
EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
  src/elpa_reduce_add_vectors.X90 \
  src/elpa_transpose_vectors.X90 \
  src/elpa1_compute_complex_template.X90 \
  src/elpa1_compute_template.X90 \
  src/elpa2_compute_real_template.X90 \
  src/elpa2_compute_complex_template.X90 \
  src/elpa2_bandred_real_template.X90 \
  src/elpa2_symm_matrix_allreduce_real_template.X90 \
  src/elpa2_trans_ev_band_to_full_real_template.X90 \
  src/elpa2_tridiag_band_real_template.X90 \
  src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
  src/elpa2_bandred_complex_template.X90 \
  src/elpa2_herm_matrix_allreduce_complex_template.X90 \
  src/elpa2_trans_ev_band_to_full_complex_template.X90 \
  src/elpa2_tridiag_band_complex_template.X90 \
  src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
  src/elpa2_kernels/elpa2_kernels_real_template.X90 \
  src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
  src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
...
...
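The dependency list above registers the new per-routine template files, while the monolithic elpa2_compute_real_template.X90 and elpa2_compute_complex_template.X90 shrink to a handful of lines in this commit (their diffs are collapsed below). As a hedged sketch only, not the verbatim file contents, the slimmed-down real template plausibly reduces to preprocessor includes that stitch the split pieces back together:

! Hedged sketch (assumption; the actual diff is collapsed below): the remaining lines of
! src/elpa2_compute_real_template.X90 presumably just include the per-routine templates.
#include "elpa2_bandred_real_template.X90"
#include "elpa2_symm_matrix_allreduce_real_template.X90"
#include "elpa2_trans_ev_band_to_full_real_template.X90"
#include "elpa2_tridiag_band_real_template.X90"
#include "elpa2_trans_ev_tridi_to_band_real_template.X90"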
src/elpa2_bandred_complex_template.X90
0 → 100644
This diff is collapsed.
src/elpa2_bandred_real_template.X90
0 → 100644
This diff is collapsed.
src/elpa2_compute.F90
...
...
@@ -103,7 +103,7 @@ module ELPA2_compute
   public :: trans_ev_band_to_full_complex_single
 #endif
   public :: band_band_real_double
-  public :: divide_band
+  ! public :: divide_band
   integer(kind=ik), public :: which_qr_decomposition = 1   ! defines, which QR-decomposition algorithm will be used
                                                             ! 0 for unblocked
...
...
src/elpa2_compute_complex_template.X90
This diff is collapsed.
src/elpa2_compute_real_template.X90
This diff is collapsed.
src/elpa2_determine_workload.F90
0 → 100644
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#include "config-f90.h"
module elpa2_workload

  implicit none

  private

  public :: determine_workload
  public :: divide_band

  contains

    subroutine determine_workload(na, nb, nprocs, limits)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      use precision
      implicit none

      integer(kind=ik), intent(in)  :: na, nb, nprocs
      integer(kind=ik), intent(out) :: limits(0:nprocs)

      integer(kind=ik)              :: i

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("determine_workload")
#endif
      if (na <= 0) then
        limits(:) = 0
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("determine_workload")
#endif
        return
      endif

      if (nb*nprocs > na) then
        ! there is not enough work for all
        do i = 0, nprocs
          limits(i) = min(na, i*nb)
        enddo
      else
        do i = 0, nprocs
          limits(i) = (i*na)/nprocs
        enddo
      endif

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("determine_workload")
#endif
    end subroutine

    !---------------------------------------------------------------------------------------------------
    ! divide_band: sets the work distribution in band
    ! Proc n works on blocks block_limits(n)+1 .. block_limits(n+1)

    subroutine divide_band(nblocks_total, n_pes, block_limits)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      use precision
      implicit none

      integer(kind=ik), intent(in)  :: nblocks_total ! total number of blocks in band
      integer(kind=ik), intent(in)  :: n_pes         ! number of PEs for division
      integer(kind=ik), intent(out) :: block_limits(0:n_pes)

      integer(kind=ik)              :: n, nblocks, nblocks_left

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("divide_band")
#endif
      block_limits(0) = 0
      if (nblocks_total < n_pes) then
        ! Not enough work for all: The first tasks get exactly 1 block
        do n = 1, n_pes
          block_limits(n) = min(nblocks_total, n)
        enddo
      else
        ! Enough work for all. If there is no exact loadbalance,
        ! the LAST tasks get more work since they are finishing earlier!
        nblocks = nblocks_total/n_pes
        nblocks_left = nblocks_total - n_pes*nblocks
        do n = 1, n_pes
          if (n <= n_pes-nblocks_left) then
            block_limits(n) = block_limits(n-1) + nblocks
          else
            block_limits(n) = block_limits(n-1) + nblocks + 1
          endif
        enddo
      endif

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("divide_band")
#endif
    end subroutine

end module elpa2_workload
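Since the diff above is a complete new module, a short usage sketch may help; this example program is not part of the commit, the sizes are illustrative only, and it assumes the ELPA "precision" module provides the integer kind ik used in the declarations.

! Hedged usage sketch (not part of this commit): the block distribution produced by
! divide_band puts the extra blocks on the last PEs, as the comment above describes.
program workload_example
  use elpa2_workload
  use precision                       ! assumed to provide the integer kind "ik"
  implicit none
  integer(kind=ik) :: block_limits(0:4), limits(0:3)

  call divide_band(10_ik, 4_ik, block_limits)
  ! block_limits = (/ 0, 2, 4, 7, 10 /):
  ! PE 0 -> blocks 1..2, PE 1 -> 3..4, PE 2 -> 5..7, PE 3 -> 8..10

  call determine_workload(100_ik, 16_ik, 3_ik, limits)
  ! nb*nprocs = 48 <= na, so the rows are split evenly: limits = (/ 0, 33, 66, 100 /)

  print *, block_limits
  print *, limits
end program workload_example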
src/elpa2_herm_matrix_allreduce_complex_template.X90
0 → 100644
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine herm_matrix_allreduce_double(n,a,lda,ldb,comm)
#else
subroutine herm_matrix_allreduce_single(n,a,lda,ldb,comm)
#endif
!-------------------------------------------------------------------------------
! herm_matrix_allreduce: Does an mpi_allreduce for a hermitian matrix A.
! On entry, only the upper half of A needs to be set
! On exit, the complete matrix is set
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik) :: n, lda, ldb, comm
complex(kind=COMPLEX_DATATYPE) :: a(lda,ldb)
integer(kind=ik) :: i, nc, mpierr
complex(kind=COMPLEX_DATATYPE) :: h1(n*n), h2(n*n)
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("herm_matrix_allreduce_double")
#else
call timer%start("herm_matrix_allreduce_single")
#endif
#endif
nc = 0
do i=1,n
h1(nc+1:nc+i) = a(1:i,i)
nc = nc+i
enddo
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(h1, h2, nc, MPI_DOUBLE_COMPLEX, MPI_SUM, comm, mpierr)
#else
call mpi_allreduce(h1, h2, nc, MPI_COMPLEX, MPI_SUM, comm, mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
nc = 0
do i=1,n
a(1:i,i) = h2(nc+1:nc+i)
a(i,1:i-1) = conjg(a(1:i-1,i))
nc = nc+i
enddo
#else /* WITH_MPI */
! h2(1:nc) = h1(1:nc)
nc = 0
do i=1,n
a(1:i,i) = h1(nc+1:nc+i)
a(i,1:i-1) = conjg(a(1:i-1,i))
nc = nc+i
enddo
#endif /* WITH_MPI */
! nc = 0
! do i=1,n
! a(1:i,i) = h2(nc+1:nc+i)
! a(i,1:i-1) = conjg(a(1:i-1,i))
! nc = nc+i
! enddo
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("herm_matrix_allreduce_double")
#else
call timer%stop("herm_matrix_allreduce_single")
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
end subroutine herm_matrix_allreduce_double
#else
end subroutine herm_matrix_allreduce_single
#endif
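The pack/unpack loops above flatten the upper triangle column by column, so only n(n+1)/2 elements travel through mpi_allreduce instead of n*n. A minimal standalone sketch (not part of this commit; default complex kind and n = 3 chosen purely for illustration) of that packing order:

! Hedged sketch of the packing used by herm_matrix_allreduce_*: for n = 3 the upper
! triangle is stored column-wise as h1 = ( a(1,1), a(1,2), a(2,2), a(1,3), a(2,3), a(3,3) ),
! i.e. nc = n*(n+1)/2 = 6 values; the lower triangle is rebuilt afterwards with conjg().
program pack_upper_example
  implicit none
  integer, parameter :: n = 3
  complex :: a(n,n), h1(n*n)
  integer :: i, j, nc

  ! fill only the upper triangle, as the routine requires on entry
  a = (0.0, 0.0)
  do j = 1, n
    do i = 1, j
      a(i,j) = cmplx(i, j)
    enddo
  enddo

  nc = 0
  do i = 1, n
    h1(nc+1:nc+i) = a(1:i,i)
    nc = nc + i
  enddo

  print *, 'packed length nc =', nc     ! prints 6
  print *, h1(1:nc)
end program pack_upper_example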
src/elpa2_symm_matrix_allreduce_real_template.X90
0 → 100644
subroutine M_symm_matrix_allreduce_PRECISION(n,a,lda,ldb,comm)
!-------------------------------------------------------------------------------
! symm_matrix_allreduce: Does an mpi_allreduce for a symmetric matrix A.
! On entry, only the upper half of A needs to be set
! On exit, the complete matrix is set
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik) :: n, lda, ldb, comm
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE) :: a(lda,*)
#else
real(kind=REAL_DATATYPE) :: a(lda,ldb)
#endif
integer(kind=ik) :: i, nc, mpierr
real(kind=REAL_DATATYPE) :: h1(n*n), h2(n*n)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("symm_matrix_allreduce" // M_PRECISION_SUFFIX)
#endif
nc = 0
do i=1,n
h1(nc+1:nc+i) = a(1:i,i)
nc = nc+i
enddo
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(h1, h2, nc, M_MPI_REAL_PRECISION, MPI_SUM, comm, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
nc = 0
do i=1,n
a(1:i,i) = h2(nc+1:nc+i)
a(i,1:i-1) = a(1:i-1,i)
nc = nc+i
enddo
#else /* WITH_MPI */
! h2=h1
nc = 0
do i=1,n
a(1:i,i) = h1(nc+1:nc+i)
a(i,1:i-1) = a(1:i-1,i)
nc = nc+i
enddo
#endif /* WITH_MPI */
! nc = 0
! do i=1,n
! a(1:i,i) = h2(nc+1:nc+i)
! a(i,1:i-1) = a(1:i-1,i)
! nc = nc+i
! enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("symm_matrix_allreduce" // M_PRECISION_SUFFIX)
#endif
end subroutine M_symm_matrix_allreduce_PRECISION
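Unlike the complex template above, this real template is written against precision macros (M_symm_matrix_allreduce_PRECISION, REAL_DATATYPE, M_MPI_REAL_PRECISION, M_PRECISION_SUFFIX) that are defined outside this commit. As an assumed illustration only, a double-precision build would expand it to roughly the following standalone routine; the concrete kind and MPI datatype are guesses, not taken from the ELPA macro definitions:

! Hedged, hand-expanded double-precision variant of the template above (assumption:
! REAL_DATATYPE maps to a double kind and M_MPI_REAL_PRECISION to MPI_REAL8).
subroutine symm_matrix_allreduce_double(n, a, lda, ldb, comm)
  use mpi
  implicit none
  integer :: n, lda, ldb, comm
  integer :: i, nc, mpierr
  real(kind=8) :: a(lda,ldb), h1(n*n), h2(n*n)

  ! pack the upper triangle column-wise
  nc = 0
  do i = 1, n
    h1(nc+1:nc+i) = a(1:i,i)
    nc = nc + i
  enddo

  call mpi_allreduce(h1, h2, nc, MPI_REAL8, MPI_SUM, comm, mpierr)

  ! unpack and mirror into the lower triangle
  nc = 0
  do i = 1, n
    a(1:i,i) = h2(nc+1:nc+i)
    a(i,1:i-1) = a(1:i-1,i)
    nc = nc + i
  enddo
end subroutine symm_matrix_allreduce_double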
src/elpa2_trans_ev_band_to_full_complex_template.X90
0 → 100644
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine trans_ev_band_to_full_complex_double(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, numBlocks, &
mpi_comm_rows, mpi_comm_cols, useGPU)
#else
subroutine trans_ev_band_to_full_complex_single(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, numBlocks, &
mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
!-------------------------------------------------------------------------------
! trans_ev_band_to_full_complex:
! Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix
!
! Parameters
!
! na Order of matrix a, number of rows of matrix q
!
! nqc Number of columns of matrix q
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwidth
!
! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after bandred_complex)
! Distribution is like in Scalapack.
!
! lda Leading dimension of a
! matrixCols local columns of matrix a and q
!
! tmat(nbw,nbw,numBlocks) Factors returned by bandred_complex
!
! q On input: Eigenvectors of band matrix
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! ldq Leading dimension of q
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
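! Hedged call sketch (illustration only, not part of the original file): inside ELPA2 this
! routine would be invoked after bandred_complex roughly as
!
!   call trans_ev_band_to_full_complex_double(na, nqc, nblk, nbw, a, lda, tmat, &
!                                             q, ldq, matrixCols, numBlocks,    &
!                                             mpi_comm_rows, mpi_comm_cols, useGPU)
!
! with a and tmat holding the Householder vectors/factors produced by bandred_complex
! and q holding the eigenvectors of the band matrix on entry.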
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use cuda_functions
use iso_c_binding
use precision
implicit none
logical, intent(in) :: useGPU
integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, &
mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
#else
complex(kind=COMPLEX_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
complex(kind=COMPLEX_DATATYPE), parameter :: CZERO = (0.0_rk8,0.0_rk8), CONE = (1.0_rk8,0.0_rk8)
#else
complex(kind=COMPLEX_DATATYPE), parameter :: CZERO = (0.0_rk4,0.0_rk4), CONE = (1.0_rk4,0.0_rk4)
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
integer(kind=ik) :: i
integer(kind=C_intptr_T) :: hvm_dev, q_dev, tmat_dev, tmp_dev
integer(kind=ik) :: istat
character(200) :: errorMessage
logical :: successCUDA
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("trans_ev_band_to_full_complex_double")
#else
call timer%start("trans_ev_band_to_full_complex_single")
#endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
max_blocks_row = ((na -1)/nblk)/np_rows + 1 ! Rows of A
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
max_local_rows = max_blocks_row*nblk
max_local_cols = max_blocks_col*nblk
allocate(tmp1(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_complex: error when allocating tmp1 "//errorMessage
stop
endif
allocate(tmp2(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_complex: error when allocating tmp2 "//errorMessage
stop
endif
allocate(hvb(max_local_rows*nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_complex: error when allocating hvb "//errorMessage
stop
endif
allocate(hvm(max_local_rows,nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_complex: error when allocating hvm "//errorMessage
stop
endif
if (useGPU) then
! allocate(q_temp(ldq,max_local_cols), stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_complex: error when allocating q_temp "//errorMessage
! endif
! allocate(tmat_temp(nbw,nbw), stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_complex: error when allocating tmat_temp "//errorMessage
! endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(hvm_dev, max_local_rows*nbw*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(hvm_dev, max_local_rows*nbw*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(tmp_dev, max_local_cols*nbw*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(tmp_dev, max_local_cols*nbw*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop
endif
!!e istat = cuda_memset(tmp_dev, 0, (max_local_rows)*(nbw)*size_of_complex_datatype)
! istat = cuda_memset(tmp_dev, 0, (max_local_cols)*(nbw)*size_of_complex_datatype)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
! stop
! endif
endif
#ifdef DOUBLE_PRECISION_COMPLEX
hvm = 0._ck8 ! Must be set to 0 !!!
hvb = 0._ck8 ! Safety only
#else
hvm = 0._ck4 ! Must be set to 0 !!!
hvb = 0._ck4 ! Safety only
#endif
if (useGPU) then
! q_temp(:,:) = 0.0
! q_temp(1:ldq,1:na_cols) = q(1:ldq,1:na_cols)
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(q_dev, loc(q),ldq*matrixCols*size_of_double_complex_datatype, cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(q_dev, loc(q),ldq*matrixCols*size_of_single_complex_datatype, cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)*size_of_double_complex_datatype)
#else
successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemset"
stop
endif
endif
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
do istep=1,(na-1)/nbw
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
! Broadcast all Householder vectors for current step compressed in hvb
nb = 0
ns = 0
do lc = 1, n_cols
ncol = istep*nbw + lc ! absolute column number of householder vector
nrow = ncol - nbw ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number
if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh)
nb = nb+l_rows
if (lc==n_cols .or. mod(ncol,nblk)==0) then
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Bcast(hvb(ns+1), nb-ns, MPI_DOUBLE_COMPLEX, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
#else
call MPI_Bcast(hvb(ns+1), nb-ns, MPI_COMPLEX, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
ns = nb
endif
enddo
! Expand compressed Householder vectors into matrix hvm
nb = 0
do lc = 1, n_cols
nrow = (istep-1)*nbw+lc ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1.
nb = nb+l_rows
enddo
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(hvm_dev,loc(hvm),(max_local_rows*nbw*size_of_double_complex_datatype),cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(hvm_dev,loc(hvm),(max_local_rows*nbw*size_of_single_complex_datatype),cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop