Commit 7a51ebd1 authored by Carolin Penke's avatar Carolin Penke
Browse files

Changes for skew-symmetric matrices introduced as comments

parent 17f5d6b1
......@@ -885,6 +885,7 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa_c_interface.c \
@top_srcdir@/src/elpa2/mod_pack_unpack_cpu.F90 \
@top_srcdir@/src/elpa2/elpa2_symm_matrix_allreduce_real_template.F90 \
@top_srcdir@/src/elpa2/elpa2_ssymm_matrix_allreduce_real_template.F90 \
@top_srcdir@/src/elpa2/elpa2_tridiag_band_template.F90 \
@top_srcdir@/src/elpa2/mod_redist_band.F90 \
@top_srcdir@/src/elpa2/pack_unpack_cpu.F90 \
......@@ -1002,6 +1003,7 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa1/elpa_solve_tridi_impl_public.F90 \
@top_srcdir@/src/elpa1/elpa1_trans_ev_template.F90 \
@top_srcdir@/src/elpa1/elpa_transpose_vectors.F90 \
@top_srcdir@/src/elpa1/elpa_transpose_vectors_ss.F90 \
@top_srcdir@/src/elpa1/elpa1_auxiliary.F90 \
@top_srcdir@/src/elpa1/elpa1_tridiag_template.F90 \
@top_srcdir@/src/elpa1/elpa1_tools_template.F90 \
......
......@@ -4,6 +4,7 @@ import sys
simple_tokens = [
"PRECISION",
"elpa_transpose_vectors_NUMBER_PRECISION",
"elpa_transpose_vectors_ssNUMBER_PRECISION",
"elpa_reduce_add_vectors_NUMBER_PRECISION",
"bandred_NUMBER_PRECISION",
"trans_ev_band_to_full_NUMBER_PRECISION",
......@@ -19,6 +20,7 @@ simple_tokens = [
"qr_pdgeqrf_2dcomm_PRECISION",
"hh_transform_NUMBER_PRECISION",
"symm_matrix_allreduce_PRECISION",
"ssymm_matrix_allreduce_PRECISION",
"herm_matrix_allreduce_PRECISION",
"redist_band_NUMBER_PRECISION",
"unpack_row_NUMBER_cpu_PRECISION",
......
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
! Author: Andreas Marek, MPCDF
#endif
#include "config-f90.h"
#include "../general/sanity.F90"
subroutine elpa_transpose_vectors_ss_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, vmat_s, ld_s, comm_s, vmat_t, ld_t, comm_t, nvs, nvr, nvc, nblk, nrThreads)
!-------------------------------------------------------------------------------
! This routine transposes an array of vectors which are distributed in
! communicator comm_s into its transposed form distributed in communicator comm_t.
! There must be an identical copy of vmat_s in every communicator comm_s.
! After this routine, there is an identical copy of vmat_t in every communicator comm_t.
!
! Skew-symmetric variant: identical to elpa_transpose_vectors_ except that the
! transposed values are NEGATED on write-back (vmat_t = -aux below), which
! realizes A^T = -A for skew-symmetric matrices.
!
! vmat_s original array of vectors
! ld_s leading dimension of vmat_s
! comm_s communicator over which vmat_s is distributed
! vmat_t array of vectors in transposed form
! ld_t leading dimension of vmat_t
! comm_t communicator over which vmat_t is distributed
! nvs global index where to start in vmat_s/vmat_t
! Please note: this is kind of a hint, some values before nvs will be
! accessed in vmat_s/put into vmat_t
! nvr global length of vmat_s/vmat_t
! nvc number of columns in vmat_s/vmat_t
! nblk block size of block cyclic distribution
!
!-------------------------------------------------------------------------------
use precision
use elpa_abstract_impl
#ifdef WITH_OPENMP
use omp_lib
#endif
use elpa_mpi
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: ld_s, comm_s, ld_t, comm_t, nvs, nvr, nvc, nblk
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(in) :: vmat_s(ld_s,nvc)
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout):: vmat_t(ld_t,nvc)
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: aux(:)
integer(kind=ik) :: myps, mypt, nps, npt
integer(kind=ik) :: n, lc, k, i, ips, ipt, ns, nl, mpierr
integer(kind=ik) :: lcm_s_t, nblks_tot, nblks_comm, nblks_skip
integer(kind=ik) :: auxstride
integer(kind=ik), intent(in) :: nrThreads
! Timer label carries the _ss suffix so this skew-symmetric variant is
! profiled separately from elpa_transpose_vectors_.
call obj%timer%start("elpa_transpose_vectors_ss_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
call obj%timer%start("mpi_communication")
call mpi_comm_rank(comm_s,myps,mpierr)
call mpi_comm_size(comm_s,nps ,mpierr)
call mpi_comm_rank(comm_t,mypt,mpierr)
call mpi_comm_size(comm_t,npt ,mpierr)
call obj%timer%stop("mpi_communication")
! The basic idea of this routine is that for every block (in the block cyclic
! distribution), the processor within comm_t which owns the diagonal
! broadcasts its values of vmat_s to all processors within comm_t.
! Of course this has not to be done for every block separately, since
! the communication pattern repeats in the global matrix after
! the least common multiple of (nps,npt) blocks
lcm_s_t = least_common_multiple(nps,npt) ! least common multiple of nps, npt
nblks_tot = (nvr+nblk-1)/nblk ! number of blocks corresponding to nvr
! Get the number of blocks to be skipped at the begin.
! This must be a multiple of lcm_s_t (else it is getting complicated),
! thus some elements before nvs will be accessed/set.
nblks_skip = ((nvs-1)/(nblk*lcm_s_t))*lcm_s_t
allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
#ifdef WITH_OPENMP
!$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
#endif
do n = 0, lcm_s_t-1
ips = mod(n,nps)
ipt = mod(n,npt)
if(mypt == ipt) then
nblks_comm = (nblks_tot-nblks_skip-n+lcm_s_t-1)/lcm_s_t
auxstride = nblk * nblks_comm
! if(nblks_comm==0) cycle
if (nblks_comm .ne. 0) then
if(myps == ips) then
! k = 0
#ifdef WITH_OPENMP
!$omp do
#endif
do lc=1,nvc
do i = nblks_skip+n, nblks_tot-1, lcm_s_t
k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride
ns = (i/nps)*nblk ! local start of block i
nl = min(nvr-i*nblk,nblk) ! length
aux(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc)
! k = k+nblk
enddo
enddo
endif
#ifdef WITH_OPENMP
!$omp barrier
!$omp master
#endif
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call MPI_Bcast(aux, nblks_comm*nblk*nvc, &
#if REALCASE == 1
MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
MPI_COMPLEX_PRECISION, &
#endif
ips, comm_s, mpierr)
call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
#ifdef WITH_OPENMP
!$omp end master
!$omp barrier
!$omp do
#endif
! k = 0
do lc=1,nvc
do i = nblks_skip+n, nblks_tot-1, lcm_s_t
k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride
ns = (i/npt)*nblk ! local start of block i
nl = min(nvr-i*nblk,nblk) ! length
! Sign flip on write-back: the transposed part of a skew-symmetric
! matrix is the negative of the stored upper part (A^T = -A).
vmat_t(ns+1:ns+nl,lc) = - aux(k+1:k+nl)
! k = k+nblk
enddo
enddo
endif
endif
enddo
#ifdef WITH_OPENMP
!$omp end parallel
#endif
deallocate(aux)
call obj%timer%stop("elpa_transpose_vectors_ss_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
end subroutine
......@@ -1157,7 +1157,12 @@
lre = min(l_rows,i*l_rows_tile)
call obj%timer%start("blas")
call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), &
#if SKEWSYMMETRIC == 1
-ONE, &
#else
ONE, &
#endif
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
call obj%timer%stop("blas")
endif ! useGPU
......@@ -1359,8 +1364,12 @@
endif ! useGPU
#if REALCASE == 1
#if SKEWSYMMETRIC == 1
call ssymm_matrix_allreduce_&
#else
call symm_matrix_allreduce_&
#endif
#endif
#if COMPLEXCASE == 1
call herm_matrix_allreduce_&
#endif
......@@ -1407,7 +1416,11 @@
endif
! Transpose umc -> umr (stored in vmr, second half)
#if SKEWSYMMETRIC == 1
call elpa_transpose_vectors_ss&
#else
call elpa_transpose_vectors_&
#endif
&MATH_DATATYPE&
&_&
&PRECISION &
......@@ -1438,6 +1451,9 @@
call obj%timer%start("blas")
call PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols, &
#if REALCASE == 1
#if SKEWSYMMETRIC == 1
0.5_rk, &
#endif
-0.5_rk, &
#endif
#if COMPLEXCASE == 1
......@@ -1448,7 +1464,11 @@
call obj%timer%stop("blas")
! Transpose umc -> umr (stored in vmr, second half)
#if SKEWSYMMETRIC == 1
call elpa_transpose_vectors_ss_&
#else
call elpa_transpose_vectors_&
#endif
&MATH_DATATYPE&
&_&
&PRECISION &
......
......@@ -60,6 +60,9 @@
#include "elpa2_bandred_template.F90"
#define REALCASE 1
#include "elpa2_symm_matrix_allreduce_real_template.F90"
#if SKEWSYMMETRIC == 1
#include "elpa2_ssymm_matrix_allreduce_real_template.F90"
#endif
#undef REALCASE
#define REALCASE 1
#include "elpa2_trans_ev_band_to_full_template.F90"
......
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#include "../general/sanity.F90"
subroutine ssymm_matrix_allreduce_&
&PRECISION &
(obj, n, a, lda, ldb, comm)
!-------------------------------------------------------------------------------
! ssymm_matrix_allreduce: Does an mpi_allreduce for a skew-symmetric matrix A.
! On entry, only the upper half of A needs to be set
! On exit, the complete matrix is set: the strict lower triangle is
! reconstructed as the NEGATED transpose of the upper triangle (A^T = -A).
!
! obj abstract ELPA object (used for timing)
! n order of the matrix
! a the matrix (upper triangle significant on entry)
! lda leading dimension of a
! ldb second dimension of a (unused with USE_ASSUMED_SIZE)
! comm MPI communicator over which to allreduce
!-------------------------------------------------------------------------------
use elpa_abstract_impl
use precision
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik) :: n, lda, ldb, comm
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE) :: a(lda,*)
#else
real(kind=REAL_DATATYPE) :: a(lda,ldb)
#endif
integer(kind=ik) :: i, nc, mpierr
real(kind=REAL_DATATYPE) :: h1(n*n), h2(n*n)
! Timer label carries the extra "s" so this skew-symmetric variant is
! profiled separately from symm_matrix_allreduce.
call obj%timer%start("ssymm_matrix_allreduce" // PRECISION_SUFFIX)
! Pack the upper triangle (column by column) into the contiguous buffer h1.
nc = 0
do i=1,n
h1(nc+1:nc+i) = a(1:i,i)
nc = nc+i
enddo
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_allreduce(h1, h2, nc, MPI_REAL_PRECISION, MPI_SUM, comm, mpierr)
call obj%timer%stop("mpi_communication")
! Unpack the reduced upper triangle and mirror it with a sign flip into the
! strict lower triangle (skew-symmetry: a(i,j) = -a(j,i)).
nc = 0
do i=1,n
a(1:i,i) = h2(nc+1:nc+i)
a(i,1:i-1) = - a(1:i-1,i)
nc = nc+i
enddo
#else /* WITH_MPI */
! h2=h1
! Without MPI there is nothing to reduce; just mirror h1 back with the
! same skew-symmetric sign flip.
nc = 0
do i=1,n
a(1:i,i) = h1(nc+1:nc+i)
a(i,1:i-1) = - a(1:i-1,i)
nc = nc+i
enddo
#endif /* WITH_MPI */
! nc = 0
! do i=1,n
! a(1:i,i) = h2(nc+1:nc+i)
! a(i,1:i-1) = a(1:i-1,i)
! nc = nc+i
! enddo
call obj%timer%stop("ssymm_matrix_allreduce" // PRECISION_SUFFIX)
! The end-statement name must match the subroutine name; the original said
! "symm_matrix_allreduce_", which does not compile.
end subroutine ssymm_matrix_allreduce_&
&PRECISION
......@@ -82,7 +82,11 @@
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,*)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,obj%local_ncols)
#ifdef SKEWSYMMETRIC == 1
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,2*obj%local_ncols)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols)
#endif
#endif
real(kind=C_DATATYPE_KIND), intent(inout) :: ev(obj%na)
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:)
......@@ -664,10 +668,35 @@
stop 1
endif
#endif
#if SKEWSYMMETRIC == 1
! Extra transformation step for skew-symmetric matrix. Multiplication with diagonal complex matrix D.
! This makes the eigenvectors complex.
! For now real part of eigenvectors is generated in first half of q, imaginary part in second part.
q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols) = 0.0
do i = 1, obj%local_nrows
! global_index = indxl2g(i, nblk, my_prow, 0, np_rows)
global_index = np_rows*nblk*((i-1)/nblk) + MOD(i-1,nblk) + MOD(np_rows+my_prow-0, np_rows)*nblk + 1
if (mod(global_index-1,4) .eq. 0) then
! do nothing
end if
if (mod(global_index-1,4) .eq. 1) then
q(i,obj%local_ncols+1:2*obj%local_ncols) = q(i,1:obj%local_ncols)
q(i,1:obj%local_ncols) = 0
end if
if (mod(global_index-1,4) .eq. 2) then
q(i,1:obj%local_ncols) = -q(i,1:obj%local_ncols)
end if
if (mod(global_index-1,4) .eq. 3) then
q(i,obj%local_ncols+1:2*obj%local_ncols) = -q(i,1:obj%local_ncols)
q(i,1:obj%local_ncols) = 0
end if
end do
#endif
! Backtransform stage 1
call obj%timer%start("trans_ev_to_band")
! In the skew-symmetric case this transforms the real part
call trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&_&
......@@ -676,10 +705,22 @@
q_dev, &
ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, &
nrThreads, success=success, kernel=kernel)
#if SKEWSYMMETRIC == 1
! Transform imaginary part
! Transformation of real and imaginary part could also be one call of trans_ev_tridi acting on the n x 2n matrix.
call trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, nev, nblk, nbw, q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols), &
q_dev, &
ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, &
nrThreads, success=success, kernel=kernel)
#endif
call obj%timer%stop("trans_ev_to_band")
if (.not.(success)) return
! We can now deallocate the stored householder vectors
deallocate(hh_trans, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......@@ -731,7 +772,7 @@
endif
! Backtransform stage 2
! In the skew-symmetric case this transforms the real part
call trans_ev_band_to_full_&
&MATH_DATATYPE&
&_&
......@@ -744,7 +785,22 @@
, useQRActual &
#endif
)
#if SKEWSYMMETRIC == 1
! Transform imaginary part
! Transformation of real and imaginary part could also be one call of trans_ev_band_to_full_ acting on the n x 2n matrix.
call trans_ev_band_to_full_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, nev, nblk, nbw, a, &
a_dev, lda, tmat, tmat_dev, q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols), &
q_dev, &
ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev_band_to_full &
#if REALCASE == 1
, useQRActual &
#endif
)
#endif
deallocate(tmat, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......
......@@ -486,7 +486,11 @@
#endif
#if REALCASE == 1
#if SKEWSYMMETRIC == 1
d(istep) = 0
#else
d(istep) = ab(1,na_s-n_off)
#endif
e(istep) = ab(2,na_s-n_off)
#endif
#if COMPLEXCASE == 1
......@@ -496,8 +500,12 @@
if (istep == na-1) then
#if REALCASE == 1
#if SKEWSYMMETRIC == 1
d(na) = 0
#else
d(na) = ab(1,na_s+1-n_off)
#endif
#endif
#if COMPLEXCASE == 1
d(na) = real(ab(1,na_s+1-n_off),kind=rk)
#endif
......@@ -855,8 +863,13 @@
if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
#if SKEWSYMMETRIC == 1
! call PRECISION_SSMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
call dssmv('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
#else
call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
#endif
#endif
#if COMPLEXCASE == 1
call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd,1)
#endif
......@@ -890,8 +903,13 @@
! Normal matrix multiply
if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
#if SKEWSYMMETRIC == 1
! call PRECISION_SSMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
call dssmv('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
#else
call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
#endif
#endif
#if COMPLEXCASE == 1
call PRECISION_HEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, ZERO, hd, 1)
#endif
......@@ -961,19 +979,29 @@
! Transform diagonal block
#if REALCASE == 1
#if SKEWSYMMETRIC == 1