Commit 65d959e7 authored by Andreas Marek

Split elpa2_compute_{real|complex}_template into several files

parent e49dd8bf
@@ -32,6 +32,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/mod_pack_unpack_complex.F90 \
src/aligned_mem.F90 \
src/elpa1_compute_private.F90 \
src/elpa2_determine_workload.F90 \
src/elpa2_compute.F90 \
src/elpa2_kernels/mod_fortran_interfaces.F90 \
src/elpa2_kernels/mod_single_hh_trafo_real.F90 \
@@ -48,10 +49,19 @@ libelpa@SUFFIX@_private_la_SOURCES = \
EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa_reduce_add_vectors.X90 \
src/elpa_transpose_vectors.X90 \
src/elpa1_compute_complex_template.X90 \
src/elpa1_compute_template.X90 \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa2_bandred_real_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_trans_ev_band_to_full_real_template.X90 \
src/elpa2_tridiag_band_real_template.X90 \
src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
src/elpa2_bandred_complex_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_trans_ev_band_to_full_complex_template.X90 \
src/elpa2_tridiag_band_complex_template.X90 \
src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
@@ -103,7 +103,7 @@ module ELPA2_compute
public :: trans_ev_band_to_full_complex_single
#endif
public :: band_band_real_double
public :: divide_band
! public :: divide_band
integer(kind=ik), public :: which_qr_decomposition = 1 ! defines which QR-decomposition algorithm will be used
! 0 for unblocked
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#include "config-f90.h"
module elpa2_workload
implicit none
private
public :: determine_workload
public :: divide_band
contains
subroutine determine_workload(na, nb, nprocs, limits)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik), intent(in) :: na, nb, nprocs
integer(kind=ik), intent(out) :: limits(0:nprocs)
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("determine_workload")
#endif
if (na <= 0) then
limits(:) = 0
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("determine_workload")
#endif
return
endif
if (nb*nprocs > na) then
! there is not enough work for all
do i = 0, nprocs
limits(i) = min(na, i*nb)
enddo
else
do i = 0, nprocs
limits(i) = (i*na)/nprocs
enddo
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("determine_workload")
#endif
end subroutine
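! Illustrative values (chosen for this comment only, not taken from the
! original source): for na=10, nb=4, nprocs=4 the first branch applies since
! nb*nprocs = 16 > na and limits = (/ 0, 4, 8, 10, 10 /), i.e. the last rank
! receives no rows; for na=100, nb=4, nprocs=4 the second branch gives the
! even split limits = (/ 0, 25, 50, 75, 100 /). By analogy with divide_band
! below, rank i presumably works on indices limits(i)+1 .. limits(i+1).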
!---------------------------------------------------------------------------------------------------
! divide_band: sets the work distribution in band
! Proc n works on blocks block_limits(n)+1 .. block_limits(n+1)
subroutine divide_band(nblocks_total, n_pes, block_limits)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik), intent(in) :: nblocks_total ! total number of blocks in band
integer(kind=ik), intent(in) :: n_pes ! number of PEs for division
integer(kind=ik), intent(out) :: block_limits(0:n_pes)
integer(kind=ik) :: n, nblocks, nblocks_left
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("divide_band")
#endif
block_limits(0) = 0
if (nblocks_total < n_pes) then
! Not enough work for all: The first tasks get exactly 1 block
do n=1,n_pes
block_limits(n) = min(nblocks_total,n)
enddo
else
! Enough work for all. If there is no exact load balance,
! the LAST tasks get more work since they finish earlier!
nblocks = nblocks_total/n_pes
nblocks_left = nblocks_total - n_pes*nblocks
do n=1,n_pes
if (n<=n_pes-nblocks_left) then
block_limits(n) = block_limits(n-1) + nblocks
else
block_limits(n) = block_limits(n-1) + nblocks + 1
endif
enddo
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("divide_band")
#endif
end subroutine
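! Worked example (values chosen for this comment only): nblocks_total=10,
! n_pes=4 gives nblocks=2 and nblocks_left=2, hence
! block_limits = (/ 0, 2, 4, 7, 10 /): PEs 0 and 1 work on 2 blocks each,
! PEs 2 and 3 on 3 blocks each, so the extra work indeed goes to the last
! tasks as described above.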
end module elpa2_workload
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine herm_matrix_allreduce_double(n,a,lda,ldb,comm)
#else
subroutine herm_matrix_allreduce_single(n,a,lda,ldb,comm)
#endif
!-------------------------------------------------------------------------------
! herm_matrix_allreduce: Does an mpi_allreduce for a Hermitian matrix A.
! On entry, only the upper half of A needs to be set
! On exit, the complete matrix is set
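! Packing used below, illustrated for n=3 (example values only): the upper
! triangle is stored column by column,
!   h1 = ( a(1,1), a(1,2), a(2,2), a(1,3), a(2,3), a(3,3) ),  nc = n*(n+1)/2 = 6,
! reduced with a single mpi_allreduce, written back, and the strictly lower
! triangle is completed via a(i,j) = conjg(a(j,i)).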
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik) :: n, lda, ldb, comm
complex(kind=COMPLEX_DATATYPE) :: a(lda,ldb)
integer(kind=ik) :: i, nc, mpierr
complex(kind=COMPLEX_DATATYPE) :: h1(n*n), h2(n*n)
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("herm_matrix_allreduce_double")
#else
call timer%start("herm_matrix_allreduce_single")
#endif
#endif
nc = 0
do i=1,n
h1(nc+1:nc+i) = a(1:i,i)
nc = nc+i
enddo
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(h1, h2, nc, MPI_DOUBLE_COMPLEX, MPI_SUM, comm, mpierr)
#else
call mpi_allreduce(h1, h2, nc, MPI_COMPLEX, MPI_SUM, comm, mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
nc = 0
do i=1,n
a(1:i,i) = h2(nc+1:nc+i)
a(i,1:i-1) = conjg(a(1:i-1,i))
nc = nc+i
enddo
#else /* WITH_MPI */
! h2(1:nc) = h1(1:nc)
nc = 0
do i=1,n
a(1:i,i) = h1(nc+1:nc+i)
a(i,1:i-1) = conjg(a(1:i-1,i))
nc = nc+i
enddo
#endif /* WITH_MPI */
! nc = 0
! do i=1,n
! a(1:i,i) = h2(nc+1:nc+i)
! a(i,1:i-1) = conjg(a(1:i-1,i))
! nc = nc+i
! enddo
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("herm_matrix_allreduce_double")
#else
call timer%stop("herm_matrix_allreduce_single")
#endif
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
end subroutine herm_matrix_allreduce_double
#else
end subroutine herm_matrix_allreduce_single
#endif
subroutine M_symm_matrix_allreduce_PRECISION(n,a,lda,ldb,comm)
!-------------------------------------------------------------------------------
! symm_matrix_allreduce: Does an mpi_allreduce for a symmetric matrix A.
! On entry, only the upper half of A needs to be set
! On exit, the complete matrix is set
!-------------------------------------------------------------------------------
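! Same packing as in herm_matrix_allreduce (n*(n+1)/2 upper-triangle entries
! collected column by column into h1, one mpi_allreduce, then unpacked); the
! only difference is that the lower triangle is filled by the plain transpose
! a(i,j) = a(j,i) rather than the conjugate.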
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
implicit none
integer(kind=ik) :: n, lda, ldb, comm
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE) :: a(lda,*)
#else
real(kind=REAL_DATATYPE) :: a(lda,ldb)
#endif
integer(kind=ik) :: i, nc, mpierr
real(kind=REAL_DATATYPE) :: h1(n*n), h2(n*n)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("symm_matrix_allreduce" // M_PRECISION_SUFFIX)
#endif
nc = 0
do i=1,n
h1(nc+1:nc+i) = a(1:i,i)
nc = nc+i
enddo
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(h1, h2, nc, M_MPI_REAL_PRECISION, MPI_SUM, comm, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
nc = 0
do i=1,n
a(1:i,i) = h2(nc+1:nc+i)
a(i,1:i-1) = a(1:i-1,i)
nc = nc+i
enddo
#else /* WITH_MPI */
! h2=h1
nc = 0
do i=1,n
a(1:i,i) = h1(nc+1:nc+i)
a(i,1:i-1) = a(1:i-1,i)
nc = nc+i
enddo
#endif /* WITH_MPI */
! nc = 0
! do i=1,n
! a(1:i,i) = h2(nc+1:nc+i)
! a(i,1:i-1) = a(1:i-1,i)
! nc = nc+i
! enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("symm_matrix_allreduce" // M_PRECISION_SUFFIX)
#endif
end subroutine M_symm_matrix_allreduce_PRECISION
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine trans_ev_band_to_full_complex_double(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, numBlocks, &
mpi_comm_rows, mpi_comm_cols, useGPU)
#else
subroutine trans_ev_band_to_full_complex_single(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, numBlocks, &
mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
!-------------------------------------------------------------------------------
! trans_ev_band_to_full_complex:
! Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix
!
! Parameters
!
! na Order of matrix a, number of rows of matrix q
!
! nqc Number of columns of matrix q
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwidth
!
! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after bandred_complex)
! Distribution is like in ScaLAPACK.
!
! lda Leading dimension of a
! matrixCols local columns of matrix a and q
!
! tmat(nbw,nbw,numBlocks) Factors returned by bandred_complex
!
! q On input: Eigenvectors of band matrix
! On output: Transformed eigenvectors
! Distribution is like in ScaLAPACK.
!
! ldq Leading dimension of q
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
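! Sketch of the blocked structure (numbers are hypothetical, for this comment
! only): the back transformation runs over istep = 1 .. (na-1)/nbw and applies
! at most nbw Householder vectors per step; e.g. na=1000, nbw=64 gives 15
! steps, the last of which applies n_cols = min(na, 16*nbw) - 15*nbw = 40
! vectors.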
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use cuda_functions
use iso_c_binding
use precision
implicit none
logical, intent(in) :: useGPU
integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, &
mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
#else
complex(kind=COMPLEX_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
complex(kind=COMPLEX_DATATYPE), parameter :: CZERO = (0.0_rk8,0.0_rk8), CONE = (1.0_rk8,0.0_rk8)
#else
complex(kind=COMPLEX_DATATYPE), parameter :: CZERO = (0.0_rk4,0.0_rk4), CONE = (1.0_rk4,0.0_rk4)
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
integer(kind=ik) :: i
integer(kind=C_intptr_T) :: hvm_dev, q_dev, tmat_dev, tmp_dev
integer(kind=ik) :: istat
character(200) :: errorMessage
logical :: successCUDA
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("trans_ev_band_to_full_complex_double")
#else
call timer%start("trans_ev_band_to_full_complex_single")
#endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
max_blocks_row = ((na -1)/nblk)/np_rows + 1 ! Rows of A
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
max_local_rows = max_blocks_row*nblk
max_local_cols = max_blocks_col*nblk
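! Size example (hypothetical values, for this comment only): na=1000, nqc=1000,
! nblk=32, np_rows=np_cols=4 yields max_blocks_row = ((999)/32)/4 + 1 = 8 and
! max_local_rows = 256; the buffers allocated below are sized from these upper
! bounds on the local part of the block-cyclic distribution.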
allocate(tmp1(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_complex: error when allocating tmp1 "//errorMessage
stop
endif
allocate(tmp2(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_complex: error when allocating tmp2 "//errorMessage
stop
endif
allocate(hvb(max_local_rows*nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_complex: error when allocating hvb "//errorMessage
stop
endif
allocate(hvm(max_local_rows,nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_complex: error when allocating hvm "//errorMessage
stop
endif
if (useGPU) then
! allocate(q_temp(ldq,max_local_cols), stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_complex: error when allocating q_temp "//errorMessage
! endif
! allocate(tmat_temp(nbw,nbw), stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_complex: error when allocating tmat_temp "//errorMessage
! endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(hvm_dev, max_local_rows*nbw*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(hvm_dev, max_local_rows*nbw*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(tmp_dev, max_local_cols*nbw*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(tmp_dev, max_local_cols*nbw*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop
endif
!!e istat = cuda_memset(tmp_dev, 0, (max_local_rows)*(nbw)*size_of_complex_datatype)
! istat = cuda_memset(tmp_dev, 0, (max_local_cols)*(nbw)*size_of_complex_datatype)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
! stop
! endif
endif
#ifdef DOUBLE_PRECISION_COMPLEX
hvm = 0._ck8 ! Must be set to 0 !!!
hvb = 0._ck8 ! Safety only
#else
hvm = 0._ck4 ! Must be set to 0 !!!
hvb = 0._ck4 ! Safety only
#endif
if (useGPU) then
! q_temp(:,:) = 0.0
! q_temp(1:ldq,1:na_cols) = q(1:ldq,1:na_cols)
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(q_dev, loc(q),ldq*matrixCols*size_of_double_complex_datatype, cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(q_dev, loc(q),ldq*matrixCols*size_of_single_complex_datatype, cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)*size_of_double_complex_datatype)
#else
successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)*size_of_single_complex_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemset"
stop
endif
endif
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
do istep=1,(na-1)/nbw
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
! Broadcast all Householder vectors for current step compressed in hvb
nb = 0
ns = 0
do lc = 1, n_cols
ncol = istep*nbw + lc ! absolute column number of householder vector
nrow = ncol - nbw ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number
if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh)
nb = nb+l_rows
if (lc==n_cols .or. mod(ncol,nblk)==0) then
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Bcast(hvb(ns+1), nb-ns, MPI_DOUBLE_COMPLEX, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
#else
call MPI_Bcast(hvb(ns+1), nb-ns, MPI_COMPLEX, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
ns = nb
endif
enddo
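! Broadcast batching above (illustration only): a batch is flushed whenever a
! block-column boundary is reached (mod(ncol,nblk)==0) or at the last column
! (lc==n_cols), so every batch consists of Householder vectors owned by a
! single process column and can be broadcast with that column as the root.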
! Expand compressed Householder vectors into matrix hvm