Commit b48cf00a by Andreas Marek

### Start to remove assumed size arrays

```This commit is not ABI compatible, since it changes the interfaces
of some routines

Also, introduce type checking for transpose and reduce_add routines```
parent bf168297
 ... ... @@ -88,6 +88,9 @@ module ELPA1 public :: hh_transform_real public :: hh_transform_complex public :: elpa_reduce_add_vectors_complex, elpa_reduce_add_vectors_real public :: elpa_transpose_vectors_complex, elpa_transpose_vectors_real !------------------------------------------------------------------------------- ! Timing results, set by every call to solve_evp_xxx ... ... @@ -149,7 +152,7 @@ end function get_elpa_row_col_comms !------------------------------------------------------------------------------- function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols) result(success) function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success) !------------------------------------------------------------------------------- ! solve_evp_real: Solves the real eigenvalue problem ... ... @@ -161,7 +164,7 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co ! nev Number of eigenvalues needed. ! The smallest nev eigenvalues/eigenvectors are calculated. ! ! a(lda,*) Distributed matrix for which eigenvalues are to be computed. ! a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. ! Distribution is like in Scalapack. ! The full matrix must be set (not only one half like in scalapack). ! Destroyed on exit (upper and lower half). ... ... @@ -170,7 +173,7 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co ! ! ev(na) On output: eigenvalues of a, every processor gets the complete set ! ! q(ldq,*) On output: Eigenvectors of a ! q(ldq,matrixCols) On output: Eigenvectors of a ! Distribution is like in Scalapack. ! Must be always dimensioned to the full size (corresponding to (na,na)) ! even if only a part of the eigenvalues is needed. ... ... @@ -189,8 +192,8 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co #endif implicit none integer, intent(in) :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols real*8 :: a(lda,*), ev(na), q(ldq,*) integer, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols real*8 :: a(lda,matrixCols), ev(na), q(ldq,matrixCols) integer :: my_prow, my_pcol, mpierr real*8, allocatable :: e(:), tau(:) ... ... @@ -218,13 +221,13 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co allocate(e(na), tau(na)) ttt0 = MPI_Wtime() call tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, ev, e, tau) call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) ttt1 = MPI_Wtime() if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0 time_evp_fwd = ttt1-ttt0 ttt0 = MPI_Wtime() call solve_tridi(na, nev, ev, e, q, ldq, nblk, mpi_comm_rows, & call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & mpi_comm_cols, wantDebug, success) if (.not.(success)) return ... ... @@ -233,7 +236,7 @@ function solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_co time_evp_solve = ttt1-ttt0 ttt0 = MPI_Wtime() call trans_ev_real(na, nev, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols) call trans_ev_real(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) ttt1 = MPI_Wtime() if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0 time_evp_back = ttt1-ttt0 ... ... @@ -249,7 +252,7 @@ end function solve_evp_real !------------------------------------------------------------------------------- function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols) result(success) function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success) !------------------------------------------------------------------------------- ! solve_evp_complex: Solves the complex eigenvalue problem ... ... @@ -261,7 +264,7 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi ! nev Number of eigenvalues needed ! The smallest nev eigenvalues/eigenvectors are calculated. ! ! a(lda,*) Distributed matrix for which eigenvalues are to be computed. ! a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. ! Distribution is like in Scalapack. ! The full matrix must be set (not only one half like in scalapack). ! Destroyed on exit (upper and lower half). ... ... @@ -270,7 +273,7 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi ! ! ev(na) On output: eigenvalues of a, every processor gets the complete set ! ! q(ldq,*) On output: Eigenvectors of a ! q(ldq,matrixCols) On output: Eigenvectors of a ! Distribution is like in Scalapack. ! Must be always dimensioned to the full size (corresponding to (na,na)) ! even if only a part of the eigenvalues is needed. ... ... @@ -290,8 +293,8 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi implicit none integer, intent(in) :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols complex*16 :: a(lda,*), q(ldq,*) integer, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols complex*16 :: a(lda,matrixCols), q(ldq,matrixCols) real*8 :: ev(na) integer :: my_prow, my_pcol, np_rows, np_cols, mpierr ... ... @@ -332,13 +335,13 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi allocate(q_real(l_rows,l_cols)) ttt0 = MPI_Wtime() call tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, ev, e, tau) call tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) ttt1 = MPI_Wtime() if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_complex :',ttt1-ttt0 time_evp_fwd = ttt1-ttt0 ttt0 = MPI_Wtime() call solve_tridi(na, nev, ev, e, q_real, l_rows, nblk, mpi_comm_rows, & call solve_tridi(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, & mpi_comm_cols, wantDebug, success) if (.not.(success)) return ... ... @@ -349,7 +352,7 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi ttt0 = MPI_Wtime() q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) call trans_ev_complex(na, nev, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols) call trans_ev_complex(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) ttt1 = MPI_Wtime() if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_complex:',ttt1-ttt0 time_evp_back = ttt1-ttt0 ... ... @@ -362,9 +365,19 @@ function solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi end function solve_evp_complex #define DATATYPE REAL #define BYTESIZE 8 #define REALCASE 1 #include "elpa_transpose_vectors.X90" #include "elpa_reduce_add_vectors.X90" #undef DATATYPE #undef BYTESIZE #undef REALCASE !------------------------------------------------------------------------------- subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, tau) subroutine tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) !------------------------------------------------------------------------------- ! tridiag_real: Reduces a distributed symmetric matrix to tridiagonal form ... ... @@ -374,12 +387,13 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta ! ! na Order of matrix ! ! a(lda,*) Distributed matrix which should be reduced. ! a(lda,matrixCols) Distributed matrix which should be reduced. ! Distribution is like in Scalapack. ! Opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half) ! a(:,:) is overwritten on exit with the Householder vectors ! ! lda Leading dimension of a ! matrixCols local columns of matrix ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ... ... @@ -399,8 +413,8 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta #endif implicit none integer na, lda, nblk, mpi_comm_rows, mpi_comm_cols real*8 a(lda,*), d(na), e(na), tau(na) integer na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols real*8 a(lda,matrixCols), d(na), e(na), tau(na) integer, parameter :: max_stored_rows = 32 ... ... @@ -497,8 +511,8 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta vr(1:l_rows) = a(1:l_rows,l_cols+1) if(nstor>0 .and. l_rows>0) then call DGEMV('N',l_rows,2*nstor,1.d0,vur,ubound(vur,1), & uvc(l_cols+1,1),ubound(uvc,1),1.d0,vr,1) call DGEMV('N',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1), & uvc(l_cols+1,1),ubound(uvc,dim=1),1.d0,vr,1) endif if(my_prow==prow(istep-1, nblk, np_rows)) then ... ... @@ -537,9 +551,9 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta ! Transpose Householder vector vr -> vc call elpa_transpose_vectors (vr, ubound(vr,1), mpi_comm_rows, & vc, ubound(vc,1), mpi_comm_cols, & 1, istep-1, 1, nblk) call elpa_transpose_vectors_real (vr, ubound(vr,dim=1), mpi_comm_rows, & vc, ubound(vc,dim=1), mpi_comm_cols, & 1, istep-1, 1, nblk) ! Calculate u = (A + VU**T + UV**T)*v ... ... @@ -589,7 +603,7 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta enddo enddo #ifdef WITH_OPENMP !\$OMP END PARALLEL !\$OMP END PARALLEL #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif ... ... @@ -600,8 +614,8 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta enddo #endif if(nstor>0) then call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,1),vr,1,0.d0,aux,1) call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,1),aux,1,1.d0,uc,1) call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1),vr,1,0.d0,aux,1) call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,dim=1),aux,1,1.d0,uc,1) endif endif ... ... @@ -612,8 +626,8 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta ! global tile size is smaller than the global remaining matrix if(tile_size < istep-1) then call elpa_reduce_add_vectors (ur, ubound(ur,1), mpi_comm_rows, & uc, ubound(uc,1), mpi_comm_cols, & call elpa_reduce_add_vectors_REAL (ur, ubound(ur,dim=1), mpi_comm_rows, & uc, ubound(uc,dim=1), mpi_comm_cols, & istep-1, 1, nblk) endif ... ... @@ -624,9 +638,9 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta call mpi_allreduce(tmp,uc,l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) endif call elpa_transpose_vectors (uc, ubound(uc,1), mpi_comm_cols, & ur, ubound(ur,1), mpi_comm_rows, & 1, istep-1, 1, nblk) call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, & ur, ubound(ur,dim=1), mpi_comm_rows, & 1, istep-1, 1, nblk) ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) ... ... @@ -659,7 +673,7 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta lre = min(l_rows,(i+1)*l_rows_tile) if(lce0) & call dsyrk('U','T',nstor,l_rows,1.d0,hvm,ubound(hvm,1),0.d0,tmat,max_stored_rows) call dsyrk('U','T',nstor,l_rows,1.d0,hvm,ubound(hvm,dim=1),0.d0,tmat,max_stored_rows) nc = 0 do n=1,nstor-1 ... ... @@ -856,7 +871,7 @@ subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_ ! Q = Q - V * T * V**T * Q if(l_rows>0) then call dgemm('T','N',nstor,l_cols,l_rows,1.d0,hvm,ubound(hvm,1), & call dgemm('T','N',nstor,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), & q,ldq,0.d0,tmp1,nstor) else tmp1(1:l_cols*nstor) = 0 ... ... @@ -864,7 +879,7 @@ subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, mpi_ call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) if(l_rows>0) then call dtrmm('L','L','N','N',nstor,l_cols,1.0d0,tmat,max_stored_rows,tmp2,nstor) call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,1), & call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,dim=1), & tmp2,nstor,1.d0,q,ldq) endif nstor = 0 ... ... @@ -1071,7 +1086,7 @@ subroutine mult_at_b_real(uplo_a, uplo_c, na, ncb, a, lda, b, ldb, nblk, mpi_com if(lcs<=lce) then allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce)) if(lrs<=lre) then call dgemm('T','N',nstor,lce-lcs+1,lre-lrs+1,1.d0,aux_mat(lrs,1),ubound(aux_mat,1), & call dgemm('T','N',nstor,lce-lcs+1,lre-lrs+1,1.d0,aux_mat(lrs,1),ubound(aux_mat,dim=1), & b(lrs,lcs),ldb,0.d0,tmp1,nstor) else tmp1 = 0 ... ... @@ -1102,7 +1117,17 @@ end subroutine mult_at_b_real !------------------------------------------------------------------------------- subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, tau) #define DATATYPE COMPLEX #define BYTESIZE 16 #define COMPLEXCASE 1 #include "elpa_transpose_vectors.X90" #include "elpa_reduce_add_vectors.X90" #undef DATATYPE #undef BYTESIZE #undef COMPLEXCASE subroutine tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) !------------------------------------------------------------------------------- ! tridiag_complex: Reduces a distributed hermitian matrix to tridiagonal form ... ... @@ -1112,12 +1137,13 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ! ! na Order of matrix ! ! a(lda,*) Distributed matrix which should be reduced. ! a(lda,matrixCols) Distributed matrix which should be reduced. ! Distribution is like in Scalapack. ! Opposed to PZHETRD, a(:,:) must be set completely (upper and lower half) ! a(:,:) is overwritten on exit with the Householder vectors ! ! lda Leading dimension of a ! matrixCols local columns of matrix a ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ... ... @@ -1137,8 +1163,8 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, #endif implicit none integer na, lda, nblk, mpi_comm_rows, mpi_comm_cols complex*16 a(lda,*), tau(na) integer na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols complex*16 a(lda,matrixCols), tau(na) real*8 d(na), e(na) integer, parameter :: max_stored_rows = 32 ... ... @@ -1241,7 +1267,7 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, vr(1:l_rows) = a(1:l_rows,l_cols+1) if(nstor>0 .and. l_rows>0) then aux(1:2*nstor) = conjg(uvc(l_cols+1,1:2*nstor)) call ZGEMV('N',l_rows,2*nstor,CONE,vur,ubound(vur,1), & call ZGEMV('N',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1), & aux,1,CONE,vr,1) endif ... ... @@ -1281,10 +1307,13 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ! Transpose Householder vector vr -> vc call elpa_transpose_vectors (vr, 2*ubound(vr,1), mpi_comm_rows, & vc, 2*ubound(vc,1), mpi_comm_cols, & 1, 2*(istep-1), 1, 2*nblk) ! call elpa_transpose_vectors (vr, 2*ubound(vr,dim=1), mpi_comm_rows, & ! vc, 2*ubound(vc,dim=1), mpi_comm_cols, & ! 1, 2*(istep-1), 1, 2*nblk) call elpa_transpose_vectors_complex (vr, ubound(vr,dim=1), mpi_comm_rows, & vc, ubound(vc,dim=1), mpi_comm_cols, & 1, (istep-1), 1, nblk) ! Calculate u = (A + VU**T + UV**T)*v ! For cache efficiency, we use only the upper half of the matrix tiles for this, ... ... @@ -1345,8 +1374,8 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, #endif if(nstor>0) then call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,1),vr,1,CZERO,aux,1) call ZGEMV('N',l_cols,2*nstor,CONE,uvc,ubound(uvc,1),aux,1,CONE,uc,1) call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1),vr,1,CZERO,aux,1) call ZGEMV('N',l_cols,2*nstor,CONE,uvc,ubound(uvc,dim=1),aux,1,CONE,uc,1) endif endif ... ... @@ -1357,9 +1386,9 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ! global tile size is smaller than the global remaining matrix if(tile_size < istep-1) then call elpa_reduce_add_vectors (ur, 2*ubound(ur,1), mpi_comm_rows, & uc, 2*ubound(uc,1), mpi_comm_cols, & 2*(istep-1), 1, 2*nblk) call elpa_reduce_add_vectors_COMPLEX (ur, ubound(ur,dim=1), mpi_comm_rows, & uc, ubound(uc,dim=1), mpi_comm_cols, & (istep-1), 1, nblk) endif ! Sum up all the uc(:) parts, transpose uc -> ur ... ... @@ -1369,9 +1398,15 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, call mpi_allreduce(tmp,uc,l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) endif call elpa_transpose_vectors (uc, 2*ubound(uc,1), mpi_comm_cols, & ur, 2*ubound(ur,1), mpi_comm_rows, & 1, 2*(istep-1), 1, 2*nblk) ! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, & ! ur, 2*ubound(ur,dim=1), mpi_comm_rows, & ! 1, 2*(istep-1), 1, 2*nblk) call elpa_transpose_vectors_complex (uc, ubound(uc,dim=1), mpi_comm_cols, & ur, ubound(ur,dim=1), mpi_comm_rows, & 1, (istep-1), 1, nblk) ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) ... ... @@ -1404,7 +1439,7 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, lre = min(l_rows,(i+1)*l_rows_tile) if(lce0) & call zherk('U','C',nstor,l_rows,CONE,hvm,ubound(hvm,1),CZERO,tmat,max_stored_rows) call zherk('U','C',nstor,l_rows,CONE,hvm,ubound(hvm,dim=1),CZERO,tmat,max_stored_rows) nc = 0 do n=1,nstor-1 ... ... @@ -1618,7 +1653,7 @@ subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, m ! Q = Q - V * T * V**T * Q if(l_rows>0) then call zgemm('C','N',nstor,l_cols,l_rows,CONE,hvm,ubound(hvm,1), & call zgemm('C','N',nstor,l_cols,l_rows,CONE,hvm,ubound(hvm,dim=1), & q,ldq,CZERO,tmp1,nstor) else tmp1(1:l_cols*nstor) = 0 ... ... @@ -1626,7 +1661,7 @@ subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, mpi_comm_rows, m call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) if(l_rows>0) then call ztrmm('L','L','N','N',nstor,l_cols,CONE,tmat,max_stored_rows,tmp2,nstor) call zgemm('N','N',l_rows,l_cols,nstor,-CONE,hvm,ubound(hvm,1), & call zgemm('N','N',l_rows,l_cols,nstor,-CONE,hvm,ubound(hvm,dim=1), & tmp2,nstor,CONE,q,ldq) endif nstor = 0 ... ... @@ -1833,7 +1868,7 @@ subroutine mult_ah_b_complex(uplo_a, uplo_c, na, ncb, a, lda, b, ldb, nblk, mpi_ if(lcs<=lce) then allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce)) if(lrs<=lre) then call zgemm('C','N',nstor,lce-lcs+1,lre-lrs+1,(1.d0,0.d0),aux_mat(lrs,1),ubound(aux_mat,1), & call zgemm('C','N',nstor,lce-lcs+1,lre-lrs+1,(1.d0,0.d0),aux_mat(lrs,1),ubound(aux_mat,dim=1), & b(lrs,lcs),ldb,(0.d0,0.d0),tmp1,nstor) else tmp1 = 0 ... ... @@ -1864,14 +1899,14 @@ end subroutine mult_ah_b_complex !------------------------------------------------------------------------------- subroutine solve_tridi( na, nev, d, e, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols, wantDebug, success ) subroutine solve_tridi( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success ) #ifdef HAVE_DETAILED_TIMINGS use timings #endif implicit none integer na, nev, ldq, nblk, mpi_comm_rows, mpi_comm_cols real*8 d(na), e(na), q(ldq,*) integer na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols real*8 d(na), e(na), q(ldq,matrixCols) integer i, j, n, np, nc, nev1, l_cols, l_rows integer my_prow, my_pcol, np_rows, np_cols, mpierr ... ... @@ -1939,7 +1974,7 @@ subroutine solve_tridi( na, nev, d, e, q, ldq, nblk, mpi_comm_rows, mpi_comm_col nev1 = MIN(nev,l_cols) endif call solve_tridi_col(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, & mpi_comm_rows, wantDebug, success) matrixCols, mpi_comm_rows, wantDebug, success) if (.not.(success)) then #ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi") ... ... @@ -2068,7 +2103,7 @@ recursive subroutine merge_recursive(np_off, nprocs, wantDebug, success) ! p_col_bc is set so that only nev eigenvalues are calculated call merge_systems(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, & nblk, mpi_comm_rows, mpi_comm_cols, l_col, p_col, & nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col, p_col, & l_col_bc, p_col_bc, np_off, nprocs, wantDebug, success ) if (.not.(success)) return else ... ... @@ -2076,7 +2111,7 @@ recursive subroutine merge_recursive(np_off, nprocs, wantDebug, success) ! Not last merge, leave dense column distribution call merge_systems(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, & nblk, mpi_comm_rows, mpi_comm_cols, l_col(noff+1), p_col(noff+1), & nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col(noff+1), p_col(noff+1), & l_col(noff+1), p_col(noff+1), np_off, nprocs, wantDebug, success ) if (.not.(success)) return endif ... ... @@ -2087,7 +2122,7 @@ end subroutine solve_tridi !------------------------------------------------------------------------------- subroutine solve_tridi_col( na, nev, nqoff, d, e, q, ldq, nblk, mpi_comm_rows, wantDebug, success ) subroutine solve_tridi_col( na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, wantDebug, success ) ! Solves the symmetric, tridiagonal eigenvalue problem on one processor column ! with the divide and conquer method. ... ... @@ -2097,8 +2132,8 @@ subroutine solve_tridi_col( na, nev, nqoff, d, e, q, ldq, nblk, mpi_comm_rows, w #endif implicit none integer :: na, nev, nqoff, ldq, nblk, mpi_comm_rows real*8 :: d(na), e(na), q(ldq,*) integer :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows real*8 :: d(na), e(na), q(ldq,matrixCols) integer, parameter :: min_submatrix_size = 16 ! Minimum size of the submatrices to be used ... ... @@ -2172,7 +2207,7 @@ subroutine solve_tridi_col( na, nev, nqoff, d, e, q, ldq, nblk, mpi_comm_rows, w nlen = limits(n+1)-noff ! Size of subproblem call solve_tridi_single(nlen,d(noff+1),e(noff+1), & q(nqoff+noff+1,noff+1),ubound(q,1), wantDebug, success) q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success) if (.not.(success)) return enddo ... ... @@ -2192,7 +2227,7 @@ subroutine solve_tridi_col( na, nev, nqoff, d, e, q, ldq, nblk, mpi_comm_rows, w nlen = limits(my_prow+1)-noff ! Size of subproblem call solve_tridi_single(nlen,d(noff+1),e(noff+1),qmat1, & ubound(qmat1,1), wantDebug, success) ubound(qmat1,dim=1), wantDebug, success) if (.not.(success)) return endif ... ... @@ -2245,7 +2280,7 @@ subroutine solve_tridi_col( na, nev, nqoff, d, e, q, ldq, nblk, mpi_comm_rows, w endif call merge_systems(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, & mpi_comm_rows, mpi_comm_self, l_col(noff+1), p_col_i(noff+1), & matrixCols, mpi_comm_rows, mpi_comm_self, l_col(noff+1), p_col_i(noff+1), & l_col(noff+1), p_col_o(noff+1), 0, 1, wantDebug, success) if (.not.(success)) return ... ... @@ -2364,17 +2399,17 @@ end subroutine solve_tridi_single !-------------------------------------------------------------------------------