Commit b48cf00a authored by Andreas Marek

Start to remove assumed-size arrays

This commit is not ABI compatible, since it changes the interfaces
of some routines

Also, introduce type checking for transpose and reduce_add routines
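
The practical difference, as a hedged sketch: with an assumed-size dummy
argument the compiler cannot see the column extent of the array, while an
explicit shape makes it visible and enables run-time bounds checking
(e.g. with gfortran's -fcheck=bounds):

    real*8 :: a(lda,*)          ! before: assumed size, extent of last dimension unknown
    real*8 :: a(lda,matrixCols) ! after: explicit shape from the new matrixCols argument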
parent bf168297
......@@ -88,6 +88,9 @@ module ELPA1
public :: hh_transform_real
public :: hh_transform_complex
public :: elpa_reduce_add_vectors_complex, elpa_reduce_add_vectors_real
public :: elpa_transpose_vectors_complex, elpa_transpose_vectors_real
!-------------------------------------------------------------------------------
! Timing results, set by every call to solve_evp_xxx
......@@ -149,7 +152,7 @@ end function get_elpa_row_col_comms
!-------------------------------------------------------------------------------
function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols) result(success)
function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success)
!-------------------------------------------------------------------------------
! solve_evp_real: Solves the real eigenvalue problem
......@@ -161,7 +164,7 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co
! nev Number of eigenvalues needed.
! The smallest nev eigenvalues/eigenvectors are calculated.
!
! a(lda,*) Distributed matrix for which eigenvalues are to be computed.
! a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed.
! Distribution is like in Scalapack.
!                  The full matrix must be set (not only one half like in Scalapack).
! Destroyed on exit (upper and lower half).
......@@ -170,7 +173,7 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co
!
! ev(na) On output: eigenvalues of a, every processor gets the complete set
!
! q(ldq,*) On output: Eigenvectors of a
! q(ldq,matrixCols) On output: Eigenvectors of a
! Distribution is like in Scalapack.
!                  Must always be dimensioned to the full size (corresponding to (na,na))
! even if only a part of the eigenvalues is needed.
......@@ -189,8 +192,8 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co
#endif
implicit none
integer, intent(in) :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols
real*8 :: a(lda,*), ev(na), q(ldq,*)
integer, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real*8 :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)
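   ! Hedged caller-side sketch (illustrative, not part of this routine):
   ! matrixCols must equal the local column count of the block-cyclic
   ! distribution, obtainable e.g. with ScaLAPACK's numroc:
   !   integer, external :: numroc
   !   matrixCols = numroc(na, nblk, my_pcol, 0, np_cols)
   !   success = solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, &
   !                            matrixCols, mpi_comm_rows, mpi_comm_cols)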
integer :: my_prow, my_pcol, mpierr
real*8, allocatable :: e(:), tau(:)
......@@ -218,13 +221,13 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co
allocate(e(na), tau(na))
ttt0 = MPI_Wtime()
call tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
ttt0 = MPI_Wtime()
call solve_tridi(na, nev, ev, e, q, ldq, nblk, mpi_comm_rows, &
call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
if (.not.(success)) return
......@@ -233,7 +236,7 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co
time_evp_solve = ttt1-ttt0
ttt0 = MPI_Wtime()
call trans_ev_real(na, nev, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols)
call trans_ev_real(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0
time_evp_back = ttt1-ttt0
......@@ -249,7 +252,7 @@ end function solve_evp_real
!-------------------------------------------------------------------------------
function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols) result(success)
function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success)
!-------------------------------------------------------------------------------
! solve_evp_complex: Solves the complex eigenvalue problem
......@@ -261,7 +264,7 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi
! nev Number of eigenvalues needed
! The smallest nev eigenvalues/eigenvectors are calculated.
!
! a(lda,*) Distributed matrix for which eigenvalues are to be computed.
! a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed.
! Distribution is like in Scalapack.
!                  The full matrix must be set (not only one half like in Scalapack).
! Destroyed on exit (upper and lower half).
......@@ -270,7 +273,7 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi
!
! ev(na) On output: eigenvalues of a, every processor gets the complete set
!
! q(ldq,*) On output: Eigenvectors of a
! q(ldq,matrixCols) On output: Eigenvectors of a
! Distribution is like in Scalapack.
!                  Must always be dimensioned to the full size (corresponding to (na,na))
! even if only a part of the eigenvalues is needed.
......@@ -290,8 +293,8 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi
implicit none
integer, intent(in) :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols
complex*16 :: a(lda,*), q(ldq,*)
integer, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex*16 :: a(lda,matrixCols), q(ldq,matrixCols)
real*8 :: ev(na)
integer :: my_prow, my_pcol, np_rows, np_cols, mpierr
......@@ -332,13 +335,13 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi
allocate(q_real(l_rows,l_cols))
ttt0 = MPI_Wtime()
call tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
call tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_complex :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
ttt0 = MPI_Wtime()
call solve_tridi(na, nev, ev, e, q_real, l_rows, nblk, mpi_comm_rows, &
call solve_tridi(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
if (.not.(success)) return
......@@ -349,7 +352,7 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi
ttt0 = MPI_Wtime()
q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
call trans_ev_complex(na, nev, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols)
call trans_ev_complex(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_complex:',ttt1-ttt0
time_evp_back = ttt1-ttt0
......@@ -362,9 +365,19 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi
end function solve_evp_complex
#define DATATYPE REAL
#define BYTESIZE 8
#define REALCASE 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
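! The includes above instantiate type-specific routines
! (elpa_transpose_vectors_real, elpa_reduce_add_vectors_real) from the
! shared .X90 templates. A hedged sketch of the assumed mechanism (the
! actual template files may differ in detail):
!   ! inside elpa_transpose_vectors.X90, a declaration such as
!   !   DATATYPE*BYTESIZE :: vmat_s(ld_s,nvc), vmat_t(ld_t,nvc)
!   ! expands to REAL*8 here and to COMPLEX*16 in the complex case.
! Each instantiation gets an explicit, type-specific interface, so a
! call that mixes real and complex arguments now fails at compile time.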
!-------------------------------------------------------------------------------
subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, tau)
subroutine tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau)
!-------------------------------------------------------------------------------
! tridiag_real: Reduces a distributed symmetric matrix to tridiagonal form
......@@ -374,12 +387,13 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
!
! na Order of matrix
!
! a(lda,*) Distributed matrix which should be reduced.
! a(lda,matrixCols) Distributed matrix which should be reduced.
! Distribution is like in Scalapack.
!              In contrast to PDSYTRD, a(:,:) must be set completely (upper and lower half)
! a(:,:) is overwritten on exit with the Householder vectors
!
! lda Leading dimension of a
!  matrixCols  Number of local columns of matrix a
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
......@@ -399,8 +413,8 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
#endif
implicit none
integer na, lda, nblk, mpi_comm_rows, mpi_comm_cols
real*8 a(lda,*), d(na), e(na), tau(na)
integer na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real*8 a(lda,matrixCols), d(na), e(na), tau(na)
integer, parameter :: max_stored_rows = 32
......@@ -497,8 +511,8 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
vr(1:l_rows) = a(1:l_rows,l_cols+1)
if(nstor>0 .and. l_rows>0) then
call DGEMV('N',l_rows,2*nstor,1.d0,vur,ubound(vur,1), &
uvc(l_cols+1,1),ubound(uvc,1),1.d0,vr,1)
call DGEMV('N',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1), &
uvc(l_cols+1,1),ubound(uvc,dim=1),1.d0,vr,1)
endif
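   ! Note on the ubound changes in this commit: ubound(x,1) and
   ! ubound(x,dim=1) are equivalent (dim is the second positional
   ! argument of ubound); the keyword form is used purely for readability:
   !   nr = ubound(vur, dim=1)   ! same value as ubound(vur, 1)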
if(my_prow==prow(istep-1, nblk, np_rows)) then
......@@ -537,9 +551,9 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
! Transpose Householder vector vr -> vc
call elpa_transpose_vectors (vr, ubound(vr,1), mpi_comm_rows, &
vc, ubound(vc,1), mpi_comm_cols, &
1, istep-1, 1, nblk)
call elpa_transpose_vectors_real (vr, ubound(vr,dim=1), mpi_comm_rows, &
vc, ubound(vc,dim=1), mpi_comm_cols, &
1, istep-1, 1, nblk)
! Calculate u = (A + VU**T + UV**T)*v
......@@ -589,7 +603,7 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
enddo
enddo
#ifdef WITH_OPENMP
!$OMP END PARALLEL
!$OMP END PARALLEL
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
......@@ -600,8 +614,8 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
enddo
#endif
if(nstor>0) then
call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,1),vr,1,0.d0,aux,1)
call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,1),aux,1,1.d0,uc,1)
call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1),vr,1,0.d0,aux,1)
call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,dim=1),aux,1,1.d0,uc,1)
endif
endif
......@@ -612,8 +626,8 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
! global tile size is smaller than the global remaining matrix
if(tile_size < istep-1) then
call elpa_reduce_add_vectors (ur, ubound(ur,1), mpi_comm_rows, &
uc, ubound(uc,1), mpi_comm_cols, &
call elpa_reduce_add_vectors_real (ur, ubound(ur,dim=1), mpi_comm_rows, &
uc, ubound(uc,dim=1), mpi_comm_cols, &
istep-1, 1, nblk)
endif
......@@ -624,9 +638,9 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
call mpi_allreduce(tmp,uc,l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
endif
call elpa_transpose_vectors (uc, ubound(uc,1), mpi_comm_cols, &
ur, ubound(ur,1), mpi_comm_rows, &
1, istep-1, 1, nblk)
call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, &
ur, ubound(ur,dim=1), mpi_comm_rows, &
1, istep-1, 1, nblk)
! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
......@@ -659,7 +673,7 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
lre = min(l_rows,(i+1)*l_rows_tile)
if(lce<lcs .or. lre<lrs) cycle
call dgemm('N','T',lre-lrs+1,lce-lcs+1,2*nstor,1.d0, &
vur(lrs,1),ubound(vur,1),uvc(lcs,1),ubound(uvc,1), &
vur(lrs,1),ubound(vur,dim=1),uvc(lcs,1),ubound(uvc,dim=1), &
1.d0,a(lrs,lcs),lda)
enddo
......@@ -702,7 +716,7 @@ end subroutine tridiag_real
!-------------------------------------------------------------------------------
subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols)
subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
!-------------------------------------------------------------------------------
! trans_ev_real: Transforms the eigenvectors of a tridiagonal matrix back
......@@ -715,10 +729,11 @@ subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_
!
! nqc Number of columns of matrix q
!
! a(lda,*) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
! Distribution is like in Scalapack.
!
! lda Leading dimension of a
!  matrixCols  Number of local columns of the matrices a and q
!
! tau(na) Factors of the Householder vectors
!
......@@ -740,8 +755,8 @@ subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_
#endif
implicit none
integer na, nqc, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols
real*8 a(lda,*), q(ldq,*), tau(na)
integer na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real*8 a(lda,matrixCols), q(ldq,matrixCols), tau(na)
integer :: max_stored_rows
......@@ -832,7 +847,7 @@ subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_
tmat = 0
if(l_rows>0) &
call dsyrk('U','T',nstor,l_rows,1.d0,hvm,ubound(hvm,1),0.d0,tmat,max_stored_rows)
call dsyrk('U','T',nstor,l_rows,1.d0,hvm,ubound(hvm,dim=1),0.d0,tmat,max_stored_rows)
nc = 0
do n=1,nstor-1
......@@ -856,7 +871,7 @@ subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_
! Q = Q - V * T * V**T * Q
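! Here tmat is assumed to hold the triangular factor T of the compact WY
! representation of the nstor stored reflectors, i.e. their product is
! I - V*T*V**T, so all of them are applied in one blocked update via the
! dgemm / dtrmm / dgemm sequence below instead of nstor rank-1 updates.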
if(l_rows>0) then
call dgemm('T','N',nstor,l_cols,l_rows,1.d0,hvm,ubound(hvm,1), &
call dgemm('T','N',nstor,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), &
q,ldq,0.d0,tmp1,nstor)
else
tmp1(1:l_cols*nstor) = 0
......@@ -864,7 +879,7 @@ subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_
call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
if(l_rows>0) then
call dtrmm('L','L','N','N',nstor,l_cols,1.0d0,tmat,max_stored_rows,tmp2,nstor)
call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,1), &
call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,dim=1), &
tmp2,nstor,1.d0,q,ldq)
endif
nstor = 0
......@@ -1071,7 +1086,7 @@ subroutine mult_at_b_real(uplo_a, uplo_c, na, ncb, a, lda, b, ldb, nblk, mpi_com
if(lcs<=lce) then
allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce))
if(lrs<=lre) then
call dgemm('T','N',nstor,lce-lcs+1,lre-lrs+1,1.d0,aux_mat(lrs,1),ubound(aux_mat,1), &
call dgemm('T','N',nstor,lce-lcs+1,lre-lrs+1,1.d0,aux_mat(lrs,1),ubound(aux_mat,dim=1), &
b(lrs,lcs),ldb,0.d0,tmp1,nstor)
else
tmp1 = 0
......@@ -1102,7 +1117,17 @@ end subroutine mult_at_b_real
!-------------------------------------------------------------------------------
subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, tau)
#define DATATYPE COMPLEX
#define BYTESIZE 16
#define COMPLEXCASE 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
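! Complex instantiation of the same templates as in the real case above,
! providing elpa_transpose_vectors_complex and
! elpa_reduce_add_vectors_complex with COMPLEX*16 arguments.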
subroutine tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau)
!-------------------------------------------------------------------------------
! tridiag_complex: Reduces a distributed Hermitian matrix to tridiagonal form
......@@ -1112,12 +1137,13 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
!
! na Order of matrix
!
! a(lda,*) Distributed matrix which should be reduced.
! a(lda,matrixCols) Distributed matrix which should be reduced.
! Distribution is like in Scalapack.
!              In contrast to PZHETRD, a(:,:) must be set completely (upper and lower half)
! a(:,:) is overwritten on exit with the Householder vectors
!
! lda Leading dimension of a
!  matrixCols  Number of local columns of matrix a
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
......@@ -1137,8 +1163,8 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
#endif
implicit none
integer na, lda, nblk, mpi_comm_rows, mpi_comm_cols
complex*16 a(lda,*), tau(na)
integer na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex*16 a(lda,matrixCols), tau(na)
real*8 d(na), e(na)
integer, parameter :: max_stored_rows = 32
......@@ -1241,7 +1267,7 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
vr(1:l_rows) = a(1:l_rows,l_cols+1)
if(nstor>0 .and. l_rows>0) then
aux(1:2*nstor) = conjg(uvc(l_cols+1,1:2*nstor))
call ZGEMV('N',l_rows,2*nstor,CONE,vur,ubound(vur,1), &
call ZGEMV('N',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1), &
aux,1,CONE,vr,1)
endif
......@@ -1281,10 +1307,13 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
! Transpose Householder vector vr -> vc
call elpa_transpose_vectors (vr, 2*ubound(vr,1), mpi_comm_rows, &
vc, 2*ubound(vc,1), mpi_comm_cols, &
1, 2*(istep-1), 1, 2*nblk)
! call elpa_transpose_vectors (vr, 2*ubound(vr,dim=1), mpi_comm_rows, &
! vc, 2*ubound(vc,dim=1), mpi_comm_cols, &
! 1, 2*(istep-1), 1, 2*nblk)
call elpa_transpose_vectors_complex (vr, ubound(vr,dim=1), mpi_comm_rows, &
vc, ubound(vc,dim=1), mpi_comm_cols, &
1, (istep-1), 1, nblk)
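   ! Previously (see the commented-out call above) the complex vectors were
   ! reinterpreted as real arrays, hence the doubled extents (2*ubound,
   ! 2*(istep-1), 2*nblk). The typed elpa_transpose_vectors_complex
   ! operates on the complex data directly, so the factors of 2 drop out
   ! and the argument types are checked by the compiler.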
! Calculate u = (A + VU**T + UV**T)*v
! For cache efficiency, we use only the upper half of the matrix tiles for this,
......@@ -1345,8 +1374,8 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
#endif
if(nstor>0) then
call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,1),vr,1,CZERO,aux,1)
call ZGEMV('N',l_cols,2*nstor,CONE,uvc,ubound(uvc,1),aux,1,CONE,uc,1)
call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1),vr,1,CZERO,aux,1)
call ZGEMV('N',l_cols,2*nstor,CONE,uvc,ubound(uvc,dim=1),aux,1,CONE,uc,1)
endif
endif
......@@ -1357,9 +1386,9 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
! global tile size is smaller than the global remaining matrix
if(tile_size < istep-1) then
call elpa_reduce_add_vectors (ur, 2*ubound(ur,1), mpi_comm_rows, &
uc, 2*ubound(uc,1), mpi_comm_cols, &
2*(istep-1), 1, 2*nblk)
call elpa_reduce_add_vectors_complex (ur, ubound(ur,dim=1), mpi_comm_rows, &
uc, ubound(uc,dim=1), mpi_comm_cols, &
(istep-1), 1, nblk)
endif
! Sum up all the uc(:) parts, transpose uc -> ur
......@@ -1369,9 +1398,15 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
call mpi_allreduce(tmp,uc,l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
endif
call elpa_transpose_vectors (uc, 2*ubound(uc,1), mpi_comm_cols, &
ur, 2*ubound(ur,1), mpi_comm_rows, &
1, 2*(istep-1), 1, 2*nblk)
! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, &
! ur, 2*ubound(ur,dim=1), mpi_comm_rows, &
! 1, 2*(istep-1), 1, 2*nblk)
call elpa_transpose_vectors_complex (uc, ubound(uc,dim=1), mpi_comm_cols, &
ur, ubound(ur,dim=1), mpi_comm_rows, &
1, (istep-1), 1, nblk)
! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
......@@ -1404,7 +1439,7 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
lre = min(l_rows,(i+1)*l_rows_tile)
if(lce<lcs .or. lre<lrs) cycle
call ZGEMM('N','C',lre-lrs+1,lce-lcs+1,2*nstor,CONE, &
vur(lrs,1),ubound(vur,1),uvc(lcs,1),ubound(uvc,1), &
vur(lrs,1),ubound(vur,dim=1),uvc(lcs,1),ubound(uvc,dim=1), &
CONE,a(lrs,lcs),lda)
enddo
......@@ -1458,7 +1493,7 @@ end subroutine tridiag_complex
!-------------------------------------------------------------------------------
subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols)
subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
!-------------------------------------------------------------------------------
! trans_ev_complex: Transforms the eigenvectors of a tridiagonal matrix back
......@@ -1471,7 +1506,7 @@ subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, m
!
! nqc Number of columns of matrix q
!
! a(lda,*) Matrix containing the Householder vectors (i.e. matrix a after tridiag_complex)
! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_complex)
! Distribution is like in Scalapack.
!
! lda Leading dimension of a
......@@ -1496,8 +1531,8 @@ subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, m
#endif
implicit none
integer na, nqc, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols
complex*16 a(lda,*), q(ldq,*), tau(na)
integer na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex*16 a(lda,matrixCols), q(ldq,matrixCols), tau(na)
integer :: max_stored_rows
......@@ -1594,7 +1629,7 @@ subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, m
tmat = 0
if(l_rows>0) &
call zherk('U','C',nstor,l_rows,CONE,hvm,ubound(hvm,1),CZERO,tmat,max_stored_rows)
call zherk('U','C',nstor,l_rows,CONE,hvm,ubound(hvm,dim=1),CZERO,tmat,max_stored_rows)
nc = 0
do n=1,nstor-1
......@@ -1618,7 +1653,7 @@ subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, m
! Q = Q - V * T * V**T * Q
if(l_rows>0) then
call zgemm('C','N',nstor,l_cols,l_rows,CONE,hvm,ubound(hvm,1), &
call zgemm('C','N',nstor,l_cols,l_rows,CONE,hvm,ubound(hvm,dim=1), &
q,ldq,CZERO,tmp1,nstor)
else
tmp1(1:l_cols*nstor) = 0
......@@ -1626,7 +1661,7 @@ subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, m
call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
if(l_rows>0) then
call ztrmm('L','L','N','N',nstor,l_cols,CONE,tmat,max_stored_rows,tmp2,nstor)
call zgemm('N','N',l_rows,l_cols,nstor,-CONE,hvm,ubound(hvm,1), &
call zgemm('N','N',l_rows,l_cols,nstor,-CONE,hvm,ubound(hvm,dim=1), &
tmp2,nstor,CONE,q,ldq)
endif
nstor = 0
......@@ -1833,7 +1868,7 @@ subroutine mult_ah_b_complex(uplo_a, uplo_c, na, ncb, a, lda, b, ldb, nblk, mpi_
if(lcs<=lce) then
allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce))
if(lrs<=lre) then
call zgemm('C','N',nstor,lce-lcs+1,lre-lrs+1,(1.d0,0.d0),aux_mat(lrs,1),ubound(aux_mat,1), &
call zgemm('C','N',nstor,lce-lcs+1,lre-lrs+1,(1.d0,0.d0),aux_mat(lrs,1),ubound(aux_mat,dim=1), &
b(lrs,lcs),ldb,(0.d0,0.d0),tmp1,nstor)
else
tmp1 = 0
......@@ -1864,14 +1899,14 @@ end subroutine mult_ah_b_complex
!-------------------------------------------------------------------------------
subroutine solve_tridi( na, nev, d, e, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols, wantDebug, success )
subroutine solve_tridi( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success )
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer na, nev, ldq, nblk, mpi_comm_rows, mpi_comm_cols
real*8 d(na), e(na), q(ldq,*)
integer na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real*8 d(na), e(na), q(ldq,matrixCols)
integer i, j, n, np, nc, nev1, l_cols, l_rows
integer my_prow, my_pcol, np_rows, np_cols, mpierr
......@@ -1939,7 +1974,7 @@ subroutine solve_tridi( na, nev, d, e, q, ldq, nblk, mpi_comm_rows, mpi_comm_col
nev1 = MIN(nev,l_cols)
endif
call solve_tridi_col(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
mpi_comm_rows, wantDebug, success)
matrixCols, mpi_comm_rows, wantDebug, success)
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_tridi")
......@@ -2068,7 +2103,7 @@ recursive subroutine merge_recursive(np_off, nprocs, wantDebug, success)
! p_col_bc is set so that only nev eigenvalues are calculated
call merge_systems(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
nblk, mpi_comm_rows, mpi_comm_cols, l_col, p_col, &
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col, p_col, &
l_col_bc, p_col_bc, np_off, nprocs, wantDebug, success )
if (.not.(success)) return
else
......@@ -2076,7 +2111,7 @@ recursive subroutine merge_recursive(np_off, nprocs, wantDebug, success)
! Not last merge, leave dense column distribution
call merge_systems(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
nblk, mpi_comm_rows, mpi_comm_cols, l_col(noff+1), p_col(noff+1), &
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col(noff+1), p_col(noff+1), &
l_col(noff+1), p_col(noff+1), np_off, nprocs, wantDebug, success )
if (.not.(success)) return
endif
......@@ -2087,7 +2122,7 @@ end subroutine solve_tridi
!-------------------------------------------------------------------------------
subroutine solve_tridi_col( na, nev, nqoff, d, e, q, ldq, nblk, mpi_comm_rows, wantDebug, success )
subroutine solve_tridi_col( na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, wantDebug, success )
! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
! with the divide-and-conquer method.
......@@ -2097,8 +2132,8 @@ subroutine solve_tridi_col( na, nev, nqoff, d, e, q, ldq, nblk, mpi_comm_rows, w
#endif
implicit none
integer :: na, nev, nqoff, ldq, nblk, mpi_comm_rows
real*8 :: d(na), e(na), q(ldq,*)
integer :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows
real*8 :: d(na), e(na), q(ldq,matrixCols)
integer, parameter :: min_submatrix_size = 16 ! Minimum size of the submatrices to be used
......@@ -2172,7 +2207,7 @@ subroutine solve_tridi_col( na, nev, nqoff, d, e, q, ldq, nblk, mpi_comm_rows, w
nlen = limits(n+1)-noff ! Size of subproblem
call solve_tridi_single(nlen,d(noff+1),e(noff+1), &
q(nqoff+noff+1,noff+1),ubound(q,1), wantDebug, success)
q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success)
if (.not.(success)) return
enddo
......@@ -2192,7 +2227,7 @@ subroutine solve_tridi_col( na, nev, nqoff, d, e, q, ldq, nblk, mpi_comm_rows, w
nlen = limits(my_prow+1)-noff ! Size of subproblem
call solve_tridi_single(nlen,d(noff+1),e(noff+1),qmat1, &
ubound(qmat1,1), wantDebug, success)
ubound(qmat1,dim=1), wantDebug, success)
if (.not.(success)) return
endif
......@@ -2245,7 +2280,7 @@ subroutine solve_tridi_col( na, nev, nqoff, d, e, q, ldq, nblk, mpi_comm_rows, w
endif
call merge_systems(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
mpi_comm_rows, mpi_comm_self, l_col(noff+1), p_col_i(noff+1), &
matrixCols, mpi_comm_rows, mpi_comm_self, l_col(noff+1), p_col_i(noff+1), &
l_col(noff+1), p_col_o(noff+1), 0, 1, wantDebug, success)
if (.not.(success)) return
......@@ -2364,17 +2399,17 @@ end subroutine solve_tridi_single