! elpa1_solve_tridi_real_template.F90
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"

subroutine solve_tridi_&
&PRECISION_AND_SUFFIX &
  ( obj, na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
                                           mpi_comm_cols, useGPU, wantDebug, success, max_threads )

  ! Solve the real symmetric tridiagonal eigenvalue problem with a
  ! divide-and-conquer scheme distributed over the 2D process grid.
  ! On entry d holds the diagonal and e the subdiagonal; on exit d holds
  ! the eigenvalues and q the (distributed) eigenvectors.
  ! success is set to .false. on any error (zero-width process column,
  ! failure in a sub-solver or merge step).
  use precision
  use elpa_abstract_impl
  implicit none
#include "../../src/general/precision_kinds.F90"
  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=ik), intent(in)               :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
  real(kind=REAL_DATATYPE), intent(inout)    :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
  real(kind=REAL_DATATYPE), intent(inout)    :: q(ldq,*)
#else
  real(kind=REAL_DATATYPE), intent(inout)    :: q(ldq,matrixCols)
#endif
  logical, intent(in)                        :: useGPU, wantDebug
  logical, intent(out)                       :: success

  integer(kind=ik)                           :: i, j, n, np, nc, nev1, l_cols, l_rows
  integer(kind=ik)                           :: my_prow, my_pcol, np_rows, np_cols
  integer(kind=MPI_KIND)                     :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
  integer(kind=ik), allocatable              :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)

  integer(kind=ik)                           :: istat
  character(200)                             :: errorMessage
  character(20)                              :: gpuString
  integer(kind=ik), intent(in)               :: max_threads

  if(useGPU) then
    gpuString = "_gpu"
  else
    gpuString = ""
  endif

  call obj%timer%start("solve_tridi" // PRECISION_SUFFIX // gpuString)

  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI, mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI, mpierr)
  call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI, mpierr)
  call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI, mpierr)

  my_prow = int(my_prowMPI,kind=c_int)
  np_rows = int(np_rowsMPI,kind=c_int)
  my_pcol = int(my_pcolMPI,kind=c_int)
  np_cols = int(np_colsMPI,kind=c_int)

  call obj%timer%stop("mpi_communication")

  success = .true.

  l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
  l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q

  ! Set Q to 0
  q(1:l_rows, 1:l_cols) = 0.0_rk

  ! Get the limits of the subdivisions, each subdivision has as many cols
  ! as fit on the respective processor column

  allocate(limits(0:np_cols), stat=istat, errmsg=errorMessage)
  check_allocate("solve_tridi: limits", istat, errorMessage)

  limits(0) = 0
  do np=0,np_cols-1
    nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np

    ! Check for the case that a column has zero width.
    ! This is not supported!
    ! Scalapack supports it but delivers no results for these columns,
    ! which is rather annoying
    if (nc==0) then
      ! Fix: include the gpuString suffix so the stopped timer name matches
      ! the one started above (it was missing on this error path).
      call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString)
      if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
      success = .false.
      return
    endif
    limits(np+1) = limits(np) + nc
  enddo

  ! Subdivide matrix by subtracting rank 1 modifications

  do i=1,np_cols-1
    n = limits(i)
    d(n) = d(n)-abs(e(n))
    d(n+1) = d(n+1)-abs(e(n))
  enddo

  ! Solve sub problems on processor columns

  nc = limits(my_pcol) ! column after which my problem starts

  if (np_cols>1) then
    nev1 = l_cols ! all eigenvectors are needed
  else
    nev1 = MIN(nev,l_cols)
  endif
  call solve_tridi_col_&
       &PRECISION_AND_SUFFIX &
         (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk,  &
                    matrixCols, mpi_comm_rows, useGPU, wantDebug, success, max_threads)
  if (.not.(success)) then
    call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString)
    return
  endif
  ! If there is only 1 processor column, we are done

  if (np_cols==1) then
    deallocate(limits, stat=istat, errmsg=errorMessage)
    check_deallocate("solve_tridi: limits", istat, errorMessage)

    call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString)
    return
  endif

  ! Set index arrays for Q columns

  ! Dense distribution scheme:

  allocate(l_col(na), stat=istat, errmsg=errorMessage)
  check_allocate("solve_tridi: l_col", istat, errorMessage)

  allocate(p_col(na), stat=istat, errmsg=errorMessage)
  check_allocate("solve_tridi: p_col", istat, errorMessage)

  n = 0
  do np=0,np_cols-1
    nc = local_index(na, np, np_cols, nblk, -1)
    do i=1,nc
      n = n+1
      l_col(n) = i
      p_col(n) = np
    enddo
  enddo

  ! Block cyclic distribution scheme, only nev columns are set:

  allocate(l_col_bc(na), stat=istat, errmsg=errorMessage)
  check_allocate("solve_tridi: l_col_bc", istat, errorMessage)

  allocate(p_col_bc(na), stat=istat, errmsg=errorMessage)
  check_allocate("solve_tridi: p_col_bc", istat, errorMessage)

  p_col_bc(:) = -1
  l_col_bc(:) = -1

  do i = 0, na-1, nblk*np_cols
    do j = 0, np_cols-1
      do n = 1, nblk
        if (i+j*nblk+n <= MIN(nev,na)) then
          p_col_bc(i+j*nblk+n) = j
          l_col_bc(i+j*nblk+n) = i/np_cols + n
        endif
      enddo
    enddo
  enddo

  ! Recursively merge sub problems
  call merge_recursive_&
       &PRECISION &
       (obj, 0, np_cols, useGPU, wantDebug, success)
  if (.not.(success)) then
    call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString)
    return
  endif

  deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errorMessage)
  check_deallocate("solve_tridi: limits, l_col, p_col, l_col_bc, p_col_bc", istat, errorMessage)

  call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString)
  return

  contains
    ! Recursively merges the eigen-subproblems of process columns
    ! [np_off, np_off+nprocs). Uses host association for limits, d, e, q,
    ! the index arrays and the communicators of the enclosing subroutine.
    recursive subroutine merge_recursive_&
    &PRECISION &
    (obj, np_off, nprocs, useGPU, wantDebug, success)
      use precision
      use elpa_abstract_impl
      implicit none

      ! noff is always a multiple of nblk_ev
      ! nlen-noff is always > nblk_ev

      class(elpa_abstract_impl_t), intent(inout) :: obj
      integer(kind=ik)     :: np_off, nprocs
      integer(kind=ik)     :: np1, np2, noff, nlen, nmid, n
      logical, intent(in)  :: useGPU, wantDebug
      logical, intent(out) :: success

      success = .true.

      if (nprocs<=1) then
        ! Safety check only
        if (wantDebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs
        success = .false.
        return
      endif
      ! Split problem into 2 subproblems of size np1 / np2

      np1 = nprocs/2
      np2 = nprocs-np1

      if (np1 > 1) call merge_recursive_&
                   &PRECISION &
      (obj, np_off, np1, useGPU, wantDebug, success)
      if (.not.(success)) return
      if (np2 > 1) call merge_recursive_&
                   &PRECISION &
      (obj, np_off+np1, np2, useGPU, wantDebug, success)
      if (.not.(success)) return

      noff = limits(np_off)
      nmid = limits(np_off+np1) - noff
      nlen = limits(np_off+nprocs) - noff

      ! Exchange the d-segments between the two halves so that every
      ! participating process column has the full merged spectrum.
#ifdef WITH_MPI
      call obj%timer%start("mpi_communication")
      if (my_pcol==np_off) then
        do n=np_off+np1,np_off+nprocs-1
          call mpi_send(d(noff+1), int(nmid,kind=MPI_KIND), MPI_REAL_PRECISION, int(n,kind=MPI_KIND), 1_MPI_KIND, &
                        int(mpi_comm_cols,kind=MPI_KIND), mpierr)
        enddo
      endif
      call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

      if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs) then
#ifdef WITH_MPI
        call obj%timer%start("mpi_communication")
        call mpi_recv(d(noff+1), int(nmid,kind=MPI_KIND), MPI_REAL_PRECISION, int(np_off,kind=MPI_KIND), 1_MPI_KIND, &
                      int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
        call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
!        d(noff+1:noff+1+nmid-1) = d(noff+1:noff+1+nmid-1)
#endif /* WITH_MPI */
      endif

      if (my_pcol==np_off+np1) then
        do n=np_off,np_off+np1-1
#ifdef WITH_MPI
          call obj%timer%start("mpi_communication")
          call mpi_send(d(noff+nmid+1), int(nlen-nmid,kind=MPI_KIND), MPI_REAL_PRECISION, int(n,kind=MPI_KIND), &
                        1_MPI_KIND, int(mpi_comm_cols,kind=MPI_KIND), mpierr)
          call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

        enddo
      endif
      if (my_pcol>=np_off .and. my_pcol<np_off+np1) then
#ifdef WITH_MPI
        call obj%timer%start("mpi_communication")
        call mpi_recv(d(noff+nmid+1), int(nlen-nmid,kind=MPI_KIND), MPI_REAL_PRECISION, int(np_off+np1,kind=MPI_KIND), &
                      1_MPI_KIND, int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
        call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
!        d(noff+nmid+1:noff+nmid+1+nlen-nmid-1) = d(noff+nmid+1:noff+nmid+1+nlen-nmid-1)
#endif /* WITH_MPI */
      endif
      if (nprocs == np_cols) then

        ! Last merge, result distribution must be block cyclic, noff==0,
        ! p_col_bc is set so that only nev eigenvalues are calculated
        call merge_systems_&
             &PRECISION &
                            (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
                            nblk, matrixCols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
                            l_col, p_col, &
                            l_col_bc, p_col_bc, np_off, nprocs, useGPU, wantDebug, success, max_threads )
        if (.not.(success)) return
      else
        ! Not last merge, leave dense column distribution
        call merge_systems_&
             &PRECISION &
                           (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
                            nblk, matrixCols, int(mpi_comm_rows,kind=ik), int(mpi_comm_cols,kind=ik), &
                            l_col(noff+1), p_col(noff+1), &
                            l_col(noff+1), p_col(noff+1), np_off, nprocs, useGPU, wantDebug, success, max_threads )
        if (.not.(success)) return
      endif
    end subroutine merge_recursive_&
           &PRECISION

end subroutine solve_tridi_&
&PRECISION_AND_SUFFIX
subroutine solve_tridi_col_&
&PRECISION_AND_SUFFIX &
( obj, na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, useGPU, wantDebug, success, max_threads )

  ! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
  ! with the divide and conquer method.
  ! Works best if the number of processor rows is a power of 2!
  ! nqoff is the global row offset of this column's block inside q.
  use precision
  use elpa_abstract_impl
  implicit none
  class(elpa_abstract_impl_t), intent(inout) :: obj

  integer(kind=ik)              :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows
  real(kind=REAL_DATATYPE)      :: d(na), e(na)
#ifdef USE_ASSUMED_SIZE
  real(kind=REAL_DATATYPE)      :: q(ldq,*)
#else
  real(kind=REAL_DATATYPE)      :: q(ldq,matrixCols)
#endif

  integer(kind=ik), parameter   :: min_submatrix_size = 16 ! Minimum size of the submatrices to be used

  real(kind=REAL_DATATYPE), allocatable    :: qmat1(:,:), qmat2(:,:)
  integer(kind=ik)              :: i, n, np
  integer(kind=ik)              :: ndiv, noff, nmid, nlen, max_size
  integer(kind=ik)              :: my_prow, np_rows
  integer(kind=MPI_KIND)        :: mpierr, my_prowMPI, np_rowsMPI

  integer(kind=ik), allocatable :: limits(:), l_col(:), p_col_i(:), p_col_o(:)
  logical, intent(in)           :: useGPU, wantDebug
  logical, intent(out)          :: success
  integer(kind=ik)              :: istat
  character(200)                :: errorMessage

  integer(kind=ik), intent(in)  :: max_threads

  call obj%timer%start("solve_tridi_col" // PRECISION_SUFFIX)
  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)

  my_prow = int(my_prowMPI,kind=c_int)
  np_rows = int(np_rowsMPI,kind=c_int)
  call obj%timer%stop("mpi_communication")
  success = .true.
  ! Calculate the number of subdivisions needed.

  n = na
  ndiv = 1
  do while(2*ndiv<=np_rows .and. n>2*min_submatrix_size)
    n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries
    ndiv = ndiv*2
  enddo

  ! If there is only 1 processor row and not all eigenvectors are needed
  ! and the matrix size is big enough, then use 2 subdivisions
  ! so that merge_systems is called once and only the needed
  ! eigenvectors are calculated for the final problem.

  if (np_rows==1 .and. nev<na .and. na>2*min_submatrix_size) ndiv = 2

  allocate(limits(0:ndiv), stat=istat, errmsg=errorMessage)
  ! Fix: use check_allocate (not check_deallocate) after an allocate.
  check_allocate("solve_tridi_col: limits", istat, errorMessage)

  limits(0) = 0
  limits(ndiv) = na

  n = ndiv
  do while(n>1)
    n = n/2 ! n is always a power of 2
    do i=0,ndiv-1,2*n
      ! We want to have even boundaries (for cache line alignments)
      limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2
    enddo
  enddo

  ! Calculate the maximum size of a subproblem

  max_size = 0
  do i=1,ndiv
    max_size = MAX(max_size,limits(i)-limits(i-1))
  enddo

  ! Subdivide matrix by subtracting rank 1 modifications

  do i=1,ndiv-1
    n = limits(i)
    d(n) = d(n)-abs(e(n))
    d(n+1) = d(n+1)-abs(e(n))
  enddo

  if (np_rows==1)    then

    ! For 1 processor row there may be 1 or 2 subdivisions
    do n=0,ndiv-1
      noff = limits(n)        ! Start of subproblem
      nlen = limits(n+1)-noff ! Size of subproblem

      call solve_tridi_single_problem_&
      &PRECISION_AND_SUFFIX &
                              (obj, nlen,d(noff+1),e(noff+1), &
                                q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success)

      if (.not.(success)) return
    enddo

  else

    ! Solve sub problems in parallel with solve_tridi_single
    ! There is at maximum 1 subproblem per processor

    allocate(qmat1(max_size,max_size), stat=istat, errmsg=errorMessage)
    check_allocate("solve_tridi_col: qmat1", istat, errorMessage)

    allocate(qmat2(max_size,max_size), stat=istat, errmsg=errorMessage)
    check_allocate("solve_tridi_col: qmat2", istat, errorMessage)

    qmat1 = 0 ! Make sure that all elements are defined

    if (my_prow < ndiv) then

      noff = limits(my_prow)        ! Start of subproblem
      nlen = limits(my_prow+1)-noff ! Size of subproblem
      call solve_tridi_single_problem_&
      &PRECISION_AND_SUFFIX &
                                (obj, nlen,d(noff+1),e(noff+1),qmat1, &
                                ubound(qmat1,dim=1), wantDebug, success)

      if (.not.(success)) return
    endif

    ! Fill eigenvectors in qmat1 into global matrix q

    do np = 0, ndiv-1

      noff = limits(np)
      nlen = limits(np+1)-noff
#ifdef WITH_MPI
      call obj%timer%start("mpi_communication")
      call MPI_Bcast(d(noff+1), int(nlen,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), &
                     int(mpi_comm_rows,kind=MPI_KIND), mpierr)
      qmat2 = qmat1
      call MPI_Bcast(qmat2, int(max_size*max_size,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), &
                     int(mpi_comm_rows,kind=MPI_KIND), mpierr)
      call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
!      qmat2 = qmat1 ! is this correct
#endif /* WITH_MPI */
      do i=1,nlen

#ifdef WITH_MPI
        call distribute_global_column_&
        &PRECISION &
               (obj, qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#else /* WITH_MPI */
        call distribute_global_column_&
        &PRECISION &
             (obj, qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#endif /* WITH_MPI */
      enddo

    enddo

    deallocate(qmat1, qmat2, stat=istat, errmsg=errorMessage)
    check_deallocate("solve_tridi_col: qmat1, qmat2", istat, errorMessage)

  endif

  ! Allocate and set index arrays l_col and p_col

  allocate(l_col(na), p_col_i(na),  p_col_o(na), stat=istat, errmsg=errorMessage)
  check_allocate("solve_tridi_col: l_col, p_col_i, p_col_o", istat, errorMessage)

  do i=1,na
    l_col(i) = i
    p_col_i(i) = 0
    p_col_o(i) = 0
  enddo

  ! Merge subproblems

  n = 1
  do while(n<ndiv) ! if ndiv==1, the problem was solved by single call to solve_tridi_single

    do i=0,ndiv-1,2*n

      noff = limits(i)
      nmid = limits(i+n) - noff
      nlen = limits(i+2*n) - noff

      if (nlen == na) then
        ! Last merge, set p_col_o=-1 for unneeded (output) eigenvectors
        p_col_o(nev+1:na) = -1
      endif
      call merge_systems_&
      &PRECISION &
                          (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
                           matrixCols, int(mpi_comm_rows,kind=ik), int(mpi_comm_self,kind=ik), &
                           l_col(noff+1), p_col_i(noff+1), &
                           l_col(noff+1), p_col_o(noff+1), 0, 1, useGPU, wantDebug, success, max_threads)
      if (.not.(success)) return

    enddo

    n = 2*n

  enddo

  deallocate(limits, l_col, p_col_i, p_col_o, stat=istat, errmsg=errorMessage)
  check_deallocate("solve_tridi_col: limits, l_col, p_col_i, p_col_o", istat, errorMessage)

  call obj%timer%stop("solve_tridi_col" // PRECISION_SUFFIX)

end subroutine solve_tridi_col_&
&PRECISION_AND_SUFFIX
subroutine solve_tridi_single_problem_&
&PRECISION_AND_SUFFIX &
(obj, nlen, d, e, q, ldq, wantDebug, success)

  ! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
  ! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
  use precision
  use elpa_abstract_impl
  use elpa_blas_interfaces

  implicit none
  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=ik)                         :: nlen, ldq
  real(kind=REAL_DATATYPE)                 :: d(nlen), e(nlen), q(ldq,nlen)

  real(kind=REAL_DATATYPE), allocatable    :: work(:), qtmp(:), ds(:), es(:)
  real(kind=REAL_DATATYPE)                 :: dtmp

  integer(kind=ik)              :: i, j, lwork, liwork, info
  integer(kind=BLAS_KIND)       :: infoBLAS
  ! Fix: declare the integer workspace with the BLAS integer kind so it can be
  ! passed to the LAPACK solver directly. Passing int(iwork,kind=BLAS_KIND)
  ! hands the solver a non-definable expression temporary to write into,
  ! which is non-conforming Fortran.
  integer(kind=BLAS_KIND), allocatable :: iwork(:)

  logical, intent(in)           :: wantDebug
  logical, intent(out)          :: success
  integer(kind=ik)              :: istat
  character(200)                :: errorMessage

  call obj%timer%start("solve_tridi_single" // PRECISION_SUFFIX)

  success = .true.
  allocate(ds(nlen), es(nlen), stat=istat, errmsg=errorMessage)
  check_allocate("solve_tridi_single: ds, es", istat, errorMessage)

  ! Save d and e for the case that dstedc fails

  ds(:) = d(:)
  es(:) = e(:)

  ! First try dstedc, this is normally faster but it may fail sometimes (why???)

  lwork = 1 + 4*nlen + nlen**2
  liwork =  3 + 5*nlen
  allocate(work(lwork), iwork(liwork), stat=istat, errmsg=errorMessage)
  check_allocate("solve_tridi_single: work, iwork", istat, errorMessage)
  call obj%timer%start("blas")
  call PRECISION_STEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND),    &
                       work, int(lwork,kind=BLAS_KIND), iwork, int(liwork,kind=BLAS_KIND), &
                       infoBLAS)
  info = int(infoBLAS,kind=ik)
  call obj%timer%stop("blas")

  if (info /= 0) then

    ! DSTEDC failed, try DSTEQR. The workspace is enough for DSTEQR.

    write(error_unit,'(a,i8,a)') 'Warning: Lapack routine DSTEDC failed, info= ',info,', Trying DSTEQR!'

    d(:) = ds(:)
    e(:) = es(:)
    call obj%timer%start("blas")
    call PRECISION_STEQR('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), work, infoBLAS )
    info = int(infoBLAS,kind=ik)
    call obj%timer%stop("blas")

    ! If DSTEQR fails also, we don't know what to do further ...

    if (info /= 0) then
      if (wantDebug) &
      write(error_unit,'(a,i8,a)') 'ELPA1_solve_tridi_single: ERROR: Lapack routine DSTEQR failed, info= ',info,', Aborting!'
      success = .false.
      return
    endif
  end if

  deallocate(work,iwork,ds,es, stat=istat, errmsg=errorMessage)
  check_deallocate("solve_tridi_single: work, iwork, ds, es", istat, errorMessage)

  ! Check if eigenvalues are monotonically increasing
  ! This seems to be not always the case  (in the IBM implementation of dstedc ???)

  do i=1,nlen-1
    if (d(i+1)<d(i)) then
#ifdef DOUBLE_PRECISION_REAL
      if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk8) then
#else
      if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4) then
#endif
        write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1)
      else
        write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1)
        write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.'
        write(error_unit,'(a)') 'In this extent, this is completely harmless.'
        write(error_unit,'(a)') 'Still, we keep this info message just in case.'
      end if
      ! Insertion-sort step: shift entries up until eigenpair i+1 is in place.
      allocate(qtmp(nlen), stat=istat, errmsg=errorMessage)
      check_allocate("solve_tridi_single: qtmp", istat, errorMessage)

      dtmp = d(i+1)
      qtmp(1:nlen) = q(1:nlen,i+1)
      do j=i,1,-1
        if (dtmp<d(j)) then
          d(j+1)        = d(j)
          q(1:nlen,j+1) = q(1:nlen,j)
        else
          exit ! Loop
        endif
      enddo
      d(j+1)        = dtmp
      q(1:nlen,j+1) = qtmp(1:nlen)
      deallocate(qtmp, stat=istat, errmsg=errorMessage)
      check_deallocate("solve_tridi_single: qtmp", istat, errorMessage)

    endif
  enddo
  call obj%timer%stop("solve_tridi_single" // PRECISION_SUFFIX)

end subroutine solve_tridi_single_problem_&
&PRECISION_AND_SUFFIX