Commit d60e24e9 authored by Andreas Marek

Merge branch 'master' of gitta:amarek/ELPA into ELPA_GPU

parents ab3f568f ccc7d1b9
......@@ -11,6 +11,11 @@ If you want to build (or have to since no packages are available) ELPA yourself,
please note that ELPA is shipped with a typical "configure" and "make"
procedure. This is the only supported way to build and install ELPA.
If you obtained ELPA from the official git repository, you will not find
the needed configure script! Please look at the "INSTALL_FROM_GIT_VERSION" file
for documentation on how to proceed.
If --- against our recommendations --- you do not want to install ELPA as a
library, but rather include it in your source code, you will have to find a
solution yourself. If you do this anyway, please then distribute all files of ELPA
......
Welcome to the git-based distribution of the ELPA eigensolver library.
If you are reading this file, you have obtained the ELPA library
through the git repository that hosts the source code and also allows
you to contribute improvements to the project if necessary.
The git version does not contain the necessary build scripts:
configure, Makefile, ...
If you use the git version, you are most likely actively developing
features/improvements for ELPA, and a rebuild of the autotools scripts
will be necessary anyway.
Thus, please run "autogen.sh" after your changes in order to rebuild the
autotools scripts. Note that autoconf version >= 2.69 is needed for
ELPA.
After this step, please proceed as written in the "INSTALL" file.
Welcome to the git-based distribution of the ELPA eigensolver library.
If you are reading this file, you have obtained the ELPA library
through the git repository that hosts the source code and also allows
you to contribute improvements to the project if necessary.
If you obtained ELPA via the official git repository, please have a look
at the "INSTALL_FROM_GIT_VERSION" file for specific instructions.
In your use of ELPA, please respect the copyright restrictions
found below and in the "COPYING" directory in this repository. In a
......
AC_PREREQ([2.69])
AC_INIT([elpa],[2015.02.002], [elpa-library@rzg.mpg.de])
AC_INIT([elpa],[2015.05.001], [elpa-library@rzg.mpg.de])
AC_SUBST([PACKAGE_VERSION])
AC_CONFIG_SRCDIR([src/elpa1.F90])
......@@ -34,7 +34,7 @@ rm -rf config.h config-f90.h
# by the current interface, as they are ABI compatible (e.g. only new symbols
# were added by the new interface)
#
AC_SUBST([ELPA_SO_VERSION], [3:0:0])
AC_SUBST([ELPA_SO_VERSION], [3:1:0])
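dnl ELPA_SO_VERSION is interpreted by libtool as current:revision:age; bumping
dnl only the revision field (here 3:0:0 -> 3:1:0) marks an internal change
dnl that leaves the library interface itself unchanged.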
#
......@@ -118,6 +118,7 @@ fi
dnl variables needed for the tests
dnl do NOT remove any variables here, until
dnl 1. you know 100% what you are doing
dnl 2. you tested ALL configure functionality afterwards
......@@ -126,6 +127,7 @@ dnl Otherwise, you most likely break some functionality
dnl by default, always define the generic kernels to be built;
dnl this is only unset if gpu_support_only is defined, or
dnl other specific real/complex kernels are wanted
install_real_generic=yes
install_real_generic_simple=yes
......@@ -424,8 +426,6 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
)
AC_MSG_RESULT([${fortran_can_check_environment}])
dnl check whether GPU version is requested
#CUDA_INSTALL_PATH="/usr/local/cuda/"
......@@ -536,7 +536,6 @@ dnl GPU version only
m4_include([m4/ax_elpa_gpu_version_only.m4])
DEFINE_OPTION_GPU_SUPPORT_ONLY([gpu-version-only],[gpu-support],[install_gpu])
dnl last check whether user wants to compile only a specific kernel
dnl
m4_include([m4/ax_elpa_specific_kernels.m4])
......@@ -603,6 +602,7 @@ dnl set the conditionals according to the previous tests
if test x"${can_use_iso_fortran_env}" = x"yes" ; then
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$install_gpu" = x"yes"])
if test x"${install_gpu}" = x"yes" ; then
AC_DEFINE([WITH_GPU_VERSION],[1],[enable GPU support])
......
......@@ -58,8 +58,7 @@ m4_copy([_AX_ELPA_LANG_OPENMP(Fortran 77)], [_AX_ELPA_LANG_OPENMP(Fortran)])
AC_DEFUN([AX_ELPA_OPENMP],
[
OPENMP_[]_AC_LANG_PREFIX[]FLAGS=
AC_ARG_ENABLE([openmp],
[AS_HELP_STRING([--disable-openmp], [do not use OpenMP])])
enable_openmp="yes"
if test "$enable_openmp" != no; then
AC_CACHE_CHECK([for _AC_LANG_ABBREV option to support OpenMP],
[ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp],
......
......@@ -52,9 +52,6 @@ module ELPA1
use elpa_utilities
#ifdef HAVE_ISO_FORTRAN_ENV
use iso_fortran_env, only : error_unit
#endif
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
......@@ -91,9 +88,8 @@ module ELPA1
public :: hh_transform_real
public :: hh_transform_complex
#ifndef HAVE_ISO_FORTRAN_ENV
integer, parameter :: error_unit = 6
#endif
public :: elpa_reduce_add_vectors_complex, elpa_reduce_add_vectors_real
public :: elpa_transpose_vectors_complex, elpa_transpose_vectors_real
!-------------------------------------------------------------------------------
......@@ -156,7 +152,7 @@ end function get_elpa_row_col_comms
!-------------------------------------------------------------------------------
function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols) result(success)
function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success)
!-------------------------------------------------------------------------------
! solve_evp_real: Solves the real eigenvalue problem
......@@ -168,7 +164,7 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co
! nev Number of eigenvalues needed.
! The smallest nev eigenvalues/eigenvectors are calculated.
!
! a(lda,*) Distributed matrix for which eigenvalues are to be computed.
! a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed.
! Distribution is like in Scalapack.
! The full matrix must be set (not only one half like in scalapack).
! Destroyed on exit (upper and lower half).
......@@ -177,7 +173,7 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co
!
! ev(na) On output: eigenvalues of a, every processor gets the complete set
!
! q(ldq,*) On output: Eigenvectors of a
! q(ldq,matrixCols) On output: Eigenvectors of a
! Distribution is like in Scalapack.
! Must be always dimensioned to the full size (corresponding to (na,na))
! even if only a part of the eigenvalues is needed.
......@@ -196,8 +192,8 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co
#endif
implicit none
integer, intent(in) :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols
real*8 :: a(lda,*), ev(na), q(ldq,*)
integer, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real*8 :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)
integer :: my_prow, my_pcol, mpierr
real*8, allocatable :: e(:), tau(:)
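For reference, a minimal sketch (not part of this commit) of how a caller might use the updated interface with the new matrixCols argument. It assumes a single MPI task, so the 1x1 process grid makes lda, ldq and matrixCols all equal to na; a real application would obtain the local sizes from its block-cyclic distribution (e.g. via ScaLAPACK's numroc). The program name, the tiny diagonal test matrix and the chosen sizes are purely illustrative, and the get_elpa_row_col_comms call follows the interface documented in the ELPA1 module.

program solve_evp_real_sketch
   ! Hypothetical driver, not part of ELPA: illustrates the new matrixCols argument.
   use elpa1
   implicit none
   include 'mpif.h'

   integer, parameter :: na = 4, nev = 4, nblk = 2
   integer  :: lda, ldq, matrixCols
   integer  :: mpi_comm_rows, mpi_comm_cols, mpierr, i
   real*8   :: a(na,na), q(na,na), ev(na)
   logical  :: success

   call mpi_init(mpierr)

   ! Row/column communicators for process (0,0) of a 1x1 grid.
   mpierr = get_elpa_row_col_comms(MPI_COMM_WORLD, 0, 0, &
                                   mpi_comm_rows, mpi_comm_cols)

   ! On a 1x1 grid the local matrix is the global one.
   lda = na; ldq = na; matrixCols = na

   a = 0.d0
   do i = 1, na
      a(i,i) = dble(i)      ! simple symmetric (diagonal) test matrix
   enddo

   ! New interface: the number of local columns is passed explicitly
   ! instead of relying on an assumed-size dummy argument a(lda,*).
   success = solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, &
                            mpi_comm_rows, mpi_comm_cols)
   if (success) write(*,*) 'eigenvalues: ', ev

   call mpi_finalize(mpierr)
end program solve_evp_real_sketch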
......@@ -225,13 +221,13 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co
allocate(e(na), tau(na))
ttt0 = MPI_Wtime()
call tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
ttt0 = MPI_Wtime()
call solve_tridi(na, nev, ev, e, q, ldq, nblk, mpi_comm_rows, &
call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
if (.not.(success)) return
......@@ -240,7 +236,7 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co
time_evp_solve = ttt1-ttt0
ttt0 = MPI_Wtime()
call trans_ev_real(na, nev, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols)
call trans_ev_real(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0
time_evp_back = ttt1-ttt0
......@@ -256,7 +252,7 @@ end function solve_evp_real
!-------------------------------------------------------------------------------
function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols) result(success)
function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success)
!-------------------------------------------------------------------------------
! solve_evp_complex: Solves the complex eigenvalue problem
......@@ -268,7 +264,7 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi
! nev Number of eigenvalues needed
! The smallest nev eigenvalues/eigenvectors are calculated.
!
! a(lda,*) Distributed matrix for which eigenvalues are to be computed.
! a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed.
! Distribution is like in Scalapack.
! The full matrix must be set (not only one half like in scalapack).
! Destroyed on exit (upper and lower half).
......@@ -277,7 +273,7 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi
!
! ev(na) On output: eigenvalues of a, every processor gets the complete set
!
! q(ldq,*) On output: Eigenvectors of a
! q(ldq,matrixCols) On output: Eigenvectors of a
! Distribution is like in Scalapack.
! Must be always dimensioned to the full size (corresponding to (na,na))
! even if only a part of the eigenvalues is needed.
......@@ -297,8 +293,8 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi
implicit none
integer, intent(in) :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols
complex*16 :: a(lda,*), q(ldq,*)
integer, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex*16 :: a(lda,matrixCols), q(ldq,matrixCols)
real*8 :: ev(na)
integer :: my_prow, my_pcol, np_rows, np_cols, mpierr
......@@ -339,13 +335,13 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi
allocate(q_real(l_rows,l_cols))
ttt0 = MPI_Wtime()
call tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
call tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_complex :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
ttt0 = MPI_Wtime()
call solve_tridi(na, nev, ev, e, q_real, l_rows, nblk, mpi_comm_rows, &
call solve_tridi(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
if (.not.(success)) return
......@@ -356,7 +352,7 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi
ttt0 = MPI_Wtime()
q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
call trans_ev_complex(na, nev, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols)
call trans_ev_complex(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_complex:',ttt1-ttt0
time_evp_back = ttt1-ttt0
......@@ -369,9 +365,19 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi
end function solve_evp_complex
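! The following preprocessor block instantiates the type-generic templates in
! elpa_transpose_vectors.X90 and elpa_reduce_add_vectors.X90 for the real case,
! producing elpa_transpose_vectors_real and elpa_reduce_add_vectors_real; an
! analogous block before tridiag_complex generates the complex variants.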
#define DATATYPE REAL
#define BYTESIZE 8
#define REALCASE 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
!-------------------------------------------------------------------------------
subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, tau)
subroutine tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau)
!-------------------------------------------------------------------------------
! tridiag_real: Reduces a distributed symmetric matrix to tridiagonal form
......@@ -381,12 +387,13 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
!
! na Order of matrix
!
! a(lda,*) Distributed matrix which should be reduced.
! a(lda,matrixCols) Distributed matrix which should be reduced.
! Distribution is like in Scalapack.
! Opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half)
! a(:,:) is overwritten on exit with the Householder vectors
!
! lda Leading dimension of a
!  matrixCols  local columns of matrix a
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
......@@ -406,8 +413,8 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
#endif
implicit none
integer na, lda, nblk, mpi_comm_rows, mpi_comm_cols
real*8 a(lda,*), d(na), e(na), tau(na)
integer na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real*8 a(lda,matrixCols), d(na), e(na), tau(na)
integer, parameter :: max_stored_rows = 32
......@@ -504,8 +511,8 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
vr(1:l_rows) = a(1:l_rows,l_cols+1)
if(nstor>0 .and. l_rows>0) then
call DGEMV('N',l_rows,2*nstor,1.d0,vur,ubound(vur,1), &
uvc(l_cols+1,1),ubound(uvc,1),1.d0,vr,1)
call DGEMV('N',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1), &
uvc(l_cols+1,1),ubound(uvc,dim=1),1.d0,vr,1)
endif
if(my_prow==prow(istep-1, nblk, np_rows)) then
......@@ -544,9 +551,9 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
! Transpose Householder vector vr -> vc
call elpa_transpose_vectors (vr, ubound(vr,1), mpi_comm_rows, &
vc, ubound(vc,1), mpi_comm_cols, &
1, istep-1, 1, nblk)
call elpa_transpose_vectors_real (vr, ubound(vr,dim=1), mpi_comm_rows, &
vc, ubound(vc,dim=1), mpi_comm_cols, &
1, istep-1, 1, nblk)
! Calculate u = (A + VU**T + UV**T)*v
......@@ -596,7 +603,7 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
enddo
enddo
#ifdef WITH_OPENMP
!$OMP END PARALLEL
!$OMP END PARALLEL
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
......@@ -607,8 +614,8 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
enddo
#endif
if(nstor>0) then
call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,1),vr,1,0.d0,aux,1)
call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,1),aux,1,1.d0,uc,1)
call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1),vr,1,0.d0,aux,1)
call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,dim=1),aux,1,1.d0,uc,1)
endif
endif
......@@ -619,8 +626,8 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
! global tile size is smaller than the global remaining matrix
if(tile_size < istep-1) then
call elpa_reduce_add_vectors (ur, ubound(ur,1), mpi_comm_rows, &
uc, ubound(uc,1), mpi_comm_cols, &
call elpa_reduce_add_vectors_REAL (ur, ubound(ur,dim=1), mpi_comm_rows, &
uc, ubound(uc,dim=1), mpi_comm_cols, &
istep-1, 1, nblk)
endif
......@@ -631,9 +638,9 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
call mpi_allreduce(tmp,uc,l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
endif
call elpa_transpose_vectors (uc, ubound(uc,1), mpi_comm_cols, &
ur, ubound(ur,1), mpi_comm_rows, &
1, istep-1, 1, nblk)
call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, &
ur, ubound(ur,dim=1), mpi_comm_rows, &
1, istep-1, 1, nblk)
! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
......@@ -666,7 +673,7 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
lre = min(l_rows,(i+1)*l_rows_tile)
if(lce<lcs .or. lre<lrs) cycle
call dgemm('N','T',lre-lrs+1,lce-lcs+1,2*nstor,1.d0, &
vur(lrs,1),ubound(vur,1),uvc(lcs,1),ubound(uvc,1), &
vur(lrs,1),ubound(vur,dim=1),uvc(lcs,1),ubound(uvc,dim=1), &
1.d0,a(lrs,lcs),lda)
enddo
......@@ -709,7 +716,7 @@ end subroutine tridiag_real
!-------------------------------------------------------------------------------
subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols)
subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
!-------------------------------------------------------------------------------
! trans_ev_real: Transforms the eigenvectors of a tridiagonal matrix back
......@@ -722,10 +729,11 @@ subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_
!
! nqc Number of columns of matrix q
!
! a(lda,*) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
! Distribution is like in Scalapack.
!
! lda Leading dimension of a
! matrixCols local columns of matrix a and q
!
! tau(na) Factors of the Householder vectors
!
......@@ -747,8 +755,8 @@ subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_
#endif
implicit none
integer na, nqc, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols
real*8 a(lda,*), q(ldq,*), tau(na)
integer na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real*8 a(lda,matrixCols), q(ldq,matrixCols), tau(na)
integer :: max_stored_rows
......@@ -839,7 +847,7 @@ subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_
tmat = 0
if(l_rows>0) &
call dsyrk('U','T',nstor,l_rows,1.d0,hvm,ubound(hvm,1),0.d0,tmat,max_stored_rows)
call dsyrk('U','T',nstor,l_rows,1.d0,hvm,ubound(hvm,dim=1),0.d0,tmat,max_stored_rows)
nc = 0
do n=1,nstor-1
......@@ -863,7 +871,7 @@ subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_
! Q = Q - V * T * V**T * Q
if(l_rows>0) then
call dgemm('T','N',nstor,l_cols,l_rows,1.d0,hvm,ubound(hvm,1), &
call dgemm('T','N',nstor,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), &
q,ldq,0.d0,tmp1,nstor)
else
tmp1(1:l_cols*nstor) = 0
......@@ -871,7 +879,7 @@ subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_
call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
if(l_rows>0) then
call dtrmm('L','L','N','N',nstor,l_cols,1.0d0,tmat,max_stored_rows,tmp2,nstor)
call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,1), &
call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,dim=1), &
tmp2,nstor,1.d0,q,ldq)
endif
nstor = 0
......@@ -1078,7 +1086,7 @@ subroutine mult_at_b_real(uplo_a, uplo_c, na, ncb, a, lda, b, ldb, nblk, mpi_com
if(lcs<=lce) then
allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce))
if(lrs<=lre) then
call dgemm('T','N',nstor,lce-lcs+1,lre-lrs+1,1.d0,aux_mat(lrs,1),ubound(aux_mat,1), &
call dgemm('T','N',nstor,lce-lcs+1,lre-lrs+1,1.d0,aux_mat(lrs,1),ubound(aux_mat,dim=1), &
b(lrs,lcs),ldb,0.d0,tmp1,nstor)
else
tmp1 = 0
......@@ -1109,7 +1117,17 @@ end subroutine mult_at_b_real
!-------------------------------------------------------------------------------
subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, tau)
#define DATATYPE COMPLEX
#define BYTESIZE 16
#define COMPLEXCASE 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
subroutine tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau)
!-------------------------------------------------------------------------------
! tridiag_complex: Reduces a distributed hermitian matrix to tridiagonal form
......@@ -1119,12 +1137,13 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
!
! na Order of matrix
!
! a(lda,*) Distributed matrix which should be reduced.
! a(lda,matrixCols) Distributed matrix which should be reduced.
! Distribution is like in Scalapack.
! Opposed to PZHETRD, a(:,:) must be set completely (upper and lower half)
! a(:,:) is overwritten on exit with the Householder vectors
!
! lda Leading dimension of a
! matrixCols local columns of matrix a
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
......@@ -1144,8 +1163,8 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
#endif
implicit none
integer na, lda, nblk, mpi_comm_rows, mpi_comm_cols
complex*16 a(lda,*), tau(na)
integer na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex*16 a(lda,matrixCols), tau(na)
real*8 d(na), e(na)
integer, parameter :: max_stored_rows = 32
......@@ -1248,7 +1267,7 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
vr(1:l_rows) = a(1:l_rows,l_cols+1)
if(nstor>0 .and. l_rows>0) then
aux(1:2*nstor) = conjg(uvc(l_cols+1,1:2*nstor))
call ZGEMV('N',l_rows,2*nstor,CONE,vur,ubound(vur,1), &
call ZGEMV('N',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1), &
aux,1,CONE,vr,1)
endif
......@@ -1288,10 +1307,13 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
! Transpose Householder vector vr -> vc
call elpa_transpose_vectors (vr, 2*ubound(vr,1), mpi_comm_rows, &
vc, 2*ubound(vc,1), mpi_comm_cols, &
1, 2*(istep-1), 1, 2*nblk)
! call elpa_transpose_vectors (vr, 2*ubound(vr,dim=1), mpi_comm_rows, &
! vc, 2*ubound(vc,dim=1), mpi_comm_cols, &
! 1, 2*(istep-1), 1, 2*nblk)
call elpa_transpose_vectors_complex (vr, ubound(vr,dim=1), mpi_comm_rows, &
vc, ubound(vc,dim=1), mpi_comm_cols, &
1, (istep-1), 1, nblk)
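  ! Note: elpa_transpose_vectors_complex operates on complex data directly, so
  ! the factor-of-2 arguments of the old call above (which viewed each complex
  ! number as a pair of reals) are no longer needed.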
! Calculate u = (A + VU**T + UV**T)*v
! For cache efficiency, we use only the upper half of the matrix tiles for this,
......@@ -1352,8 +1374,8 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
#endif
if(nstor>0) then
call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,1),vr,1,CZERO,aux,1)
call ZGEMV('N',l_cols,2*nstor,CONE,uvc,ubound(uvc,1),aux,1,CONE,uc,1)
call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1),vr,1,CZERO,aux,1)
call ZGEMV('N',l_cols,2*nstor,CONE,uvc,ubound(uvc,dim=1),aux,1,CONE,uc,1)
endif
endif
......@@ -1364,9 +1386,9 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
! global tile size is smaller than the global remaining matrix
if(tile_size < istep-1) then
call elpa_reduce_add_vectors (ur, 2*ubound(ur,1), mpi_comm_rows, &
uc, 2*ubound(uc,1), mpi_comm_cols, &
2*(istep-1), 1, 2*nblk)
call elpa_reduce_add_vectors_COMPLEX (ur, ubound(ur,dim=1), mpi_comm_rows, &
uc, ubound(uc,dim=1), mpi_comm_cols, &
(istep-1), 1, nblk)
endif
! Sum up all the uc(:) parts, transpose uc -> ur
......@@ -1376,9 +1398,15 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
call mpi_allreduce(tmp,uc,l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
endif
call elpa_transpose_vectors (uc, 2*ubound(uc,1), mpi_comm_cols, &
ur, 2*ubound(ur,1), mpi_comm_rows, &
1, 2*(istep-1), 1, 2*nblk)
! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, &
! ur, 2*ubound(ur,dim=1), mpi_comm_rows, &
! 1, 2*(istep-1), 1, 2*nblk)
call elpa_transpose_vectors_complex (uc, ubound(uc,dim=1), mpi_comm_cols, &
ur, ubound(ur,dim=1), mpi_comm_rows, &
1, (istep-1), 1, nblk)
! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
......@@ -1411,7 +1439,7 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
lre = min(l_rows,(i+1)*l_rows_tile)
if(lce<lcs .or. lre<lrs) cycle
call ZGEMM('N','C',lre-lrs+1,lce-lcs+1,2*nstor,CONE, &
vur(lrs,1),ubound(vur,1),uvc(lcs,1),ubound(uvc,1), &
vur(lrs,1),ubound(vur,dim=1),uvc(lcs,1),ubound(uvc,dim=1), &
CONE,a(lrs,lcs),lda)
enddo
......@@ -1465,7 +1493,7 @@ end subroutine tridiag_complex
!-------------------------------------------------------------------------------
subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols)
subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
!-------------------------------------------------------------------------------
! trans_ev_complex: Transforms the eigenvectors of a tridiagonal matrix back
......@@ -1478,7 +1506,7 @@ subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, m
!
! nqc Number of columns of matrix q
!
! a(lda,*) Matrix containing the Householder vectors (i.e. matrix a after tridiag_complex)
! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_complex)
! Distribution is like in Scalapack.
!
! lda Leading dimension of a
......@@ -1503,8 +1531,8 @@ subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, m
#endif
implicit none
integer na, nqc, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols
complex*16 a(lda,*), q(ldq,*), tau(na)
integer na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex*16 a(lda,matrixCols), q(ldq,matrixCols), tau(na)
integer :: max_stored_rows
......@@ -1601,7 +1629,7 @@ subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, m
tmat = 0
if(l_rows>0) &
call zherk('U','C',nstor,l_rows,CONE,hvm,ubound(hvm,1),CZERO,tmat,max_stored_rows)
call zherk('U','C',nstor,l_rows,CONE,hvm,ubound(hvm,dim=1),CZERO,tmat,max_stored_rows)
nc = 0
do n=1,nstor-1
......@@ -1625,7 +1653,7 @@ subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, m
! Q = Q - V * T * V**T * Q
if(l_rows>0) then
call zgemm('C','N',nstor,l_cols,l_rows,CONE,hvm,ubound(hvm,1), &
call zgemm('C','N',nstor,l_cols,l_rows,CONE,hvm,ubound(hvm,dim=1), &
q,ldq,CZERO,tmp1,nstor)
else
tmp1(1:l_cols*nstor) = 0
......@@ -1633,7 +1661,7 @@ subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, m
call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
if(l_rows>0) then
call ztrmm('L','L','N','N',nstor,l_cols,CONE,tmat,max_stored_rows,tmp2,nstor)
call zgemm('N','N',l_rows,l_cols,nstor,-CONE,hvm,ubound(hvm,1), &
call zgemm('N','N',l_rows,l_cols,nstor,-CONE,hvm,ubound(hvm,dim=1), &