elpa1_solve_tridi_real_template.F90 25 KB
Newer Older
Pavel Kus's avatar
Pavel Kus committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

55
#include "../general/sanity.F90"
56

57
subroutine solve_tridi_&
58
&PRECISION_AND_SUFFIX &
59
    ( obj, na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
Andreas Marek's avatar
Andreas Marek committed
60
                                           mpi_comm_cols, useGPU, wantDebug, success, max_threads )
Pavel Kus's avatar
Pavel Kus committed
61 62

      use precision
63
      use elpa_abstract_impl
Pavel Kus's avatar
Pavel Kus committed
64
      implicit none
65
#include "../../src/general/precision_kinds.F90"
66
      class(elpa_abstract_impl_t), intent(inout) :: obj
Andreas Marek's avatar
Andreas Marek committed
67 68
      integer(kind=ik), intent(in)               :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      real(kind=REAL_DATATYPE), intent(inout)    :: d(na), e(na)
Pavel Kus's avatar
Pavel Kus committed
69
#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
70
      real(kind=REAL_DATATYPE), intent(inout)    :: q(ldq,*)
Pavel Kus's avatar
Pavel Kus committed
71
#else
Andreas Marek's avatar
Andreas Marek committed
72
      real(kind=REAL_DATATYPE), intent(inout)    :: q(ldq,matrixCols)
Pavel Kus's avatar
Pavel Kus committed
73
#endif
Andreas Marek's avatar
Andreas Marek committed
74 75
      logical, intent(in)                        :: useGPU, wantDebug
      logical, intent(out)                       :: success
Pavel Kus's avatar
Pavel Kus committed
76

Andreas Marek's avatar
Andreas Marek committed
77 78
      integer(kind=ik)                           :: i, j, n, np, nc, nev1, l_cols, l_rows
      integer(kind=ik)                           :: my_prow, my_pcol, np_rows, np_cols, mpierr
Pavel Kus's avatar
Pavel Kus committed
79

Andreas Marek's avatar
Andreas Marek committed
80
      integer(kind=ik), allocatable              :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)
Pavel Kus's avatar
Pavel Kus committed
81

Andreas Marek's avatar
Andreas Marek committed
82 83 84 85
      integer(kind=ik)                           :: istat
      character(200)                             :: errorMessage
      character(20)                              :: gpuString
      integer(kind=ik), intent(in)               :: max_threads
Pavel Kus's avatar
Pavel Kus committed
86

87 88 89 90 91 92 93
      if(useGPU) then
        gpuString = "_gpu"
      else
        gpuString = ""
      endif

      call obj%timer%start("solve_tridi" // PRECISION_SUFFIX // gpuString)
94

95
      call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
96 97 98 99
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
100
      call obj%timer%stop("mpi_communication")
101

Pavel Kus's avatar
Pavel Kus committed
102 103 104 105 106 107
      success = .true.

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q

      ! Set Q to 0
108
      q(1:l_rows, 1:l_cols) = 0.0_rk
Pavel Kus's avatar
Pavel Kus committed
109 110 111 112 113 114 115

      ! Get the limits of the subdivisons, each subdivison has as many cols
      ! as fit on the respective processor column

      allocate(limits(0:np_cols), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_tridi: error when allocating limits "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
116
        stop 1
Pavel Kus's avatar
Pavel Kus committed
117 118 119 120 121 122 123 124 125 126 127
      endif

      limits(0) = 0
      do np=0,np_cols-1
        nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np

        ! Check for the case that a column has have zero width.
        ! This is not supported!
        ! Scalapack supports it but delivers no results for these columns,
        ! which is rather annoying
        if (nc==0) then
128
          call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX)
Pavel Kus's avatar
Pavel Kus committed
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
          if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
          success = .false.
          return
        endif
        limits(np+1) = limits(np) + nc
      enddo

      ! Subdivide matrix by subtracting rank 1 modifications

      do i=1,np_cols-1
        n = limits(i)
        d(n) = d(n)-abs(e(n))
        d(n+1) = d(n+1)-abs(e(n))
      enddo

      ! Solve sub problems on processsor columns

      nc = limits(my_pcol) ! column after which my problem starts

      if (np_cols>1) then
        nev1 = l_cols ! all eigenvectors are needed
      else
        nev1 = MIN(nev,l_cols)
      endif
153
      call solve_tridi_col_&
154
           &PRECISION_AND_SUFFIX &
155
             (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk,  &
Andreas Marek's avatar
Andreas Marek committed
156
                        matrixCols, mpi_comm_rows, useGPU, wantDebug, success, max_threads)
Pavel Kus's avatar
Pavel Kus committed
157
      if (.not.(success)) then
Pavel Kus's avatar
Pavel Kus committed
158
        call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString)
Pavel Kus's avatar
Pavel Kus committed
159 160 161 162 163 164 165 166
        return
      endif
      ! If there is only 1 processor column, we are done

      if (np_cols==1) then
        deallocate(limits, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"solve_tridi: error when deallocating limits "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
167
          stop 1
Pavel Kus's avatar
Pavel Kus committed
168 169
        endif

Pavel Kus's avatar
Pavel Kus committed
170
        call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString)
Pavel Kus's avatar
Pavel Kus committed
171 172 173 174 175 176 177 178 179 180
        return
      endif

      ! Set index arrays for Q columns

      ! Dense distribution scheme:

      allocate(l_col(na), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_tridi: error when allocating l_col "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
181
        stop 1
Pavel Kus's avatar
Pavel Kus committed
182 183 184 185 186
      endif

      allocate(p_col(na), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_tridi: error when allocating p_col "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
187
        stop 1
Pavel Kus's avatar
Pavel Kus committed
188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204
      endif

      n = 0
      do np=0,np_cols-1
        nc = local_index(na, np, np_cols, nblk, -1)
        do i=1,nc
          n = n+1
          l_col(n) = i
          p_col(n) = np
        enddo
      enddo

      ! Block cyclic distribution scheme, only nev columns are set:

      allocate(l_col_bc(na), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_tridi: error when allocating l_col_bc "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
205
        stop 1
Pavel Kus's avatar
Pavel Kus committed
206 207 208 209 210
      endif

      allocate(p_col_bc(na), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_tridi: error when allocating p_col_bc "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
211
        stop 1
Pavel Kus's avatar
Pavel Kus committed
212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228
      endif

      p_col_bc(:) = -1
      l_col_bc(:) = -1

      do i = 0, na-1, nblk*np_cols
        do j = 0, np_cols-1
          do n = 1, nblk
            if (i+j*nblk+n <= MIN(nev,na)) then
              p_col_bc(i+j*nblk+n) = j
              l_col_bc(i+j*nblk+n) = i/np_cols + n
             endif
           enddo
         enddo
      enddo

      ! Recursively merge sub problems
229
      call merge_recursive_&
Andreas Marek's avatar
cleanup  
Andreas Marek committed
230
           &PRECISION &
231
           (obj, 0, np_cols, useGPU, wantDebug, success)
Pavel Kus's avatar
Pavel Kus committed
232
      if (.not.(success)) then
Pavel Kus's avatar
Pavel Kus committed
233
        call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString)
Pavel Kus's avatar
Pavel Kus committed
234 235 236 237 238 239
        return
      endif

      deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_tridi: error when deallocating l_col "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
240
        stop 1
Pavel Kus's avatar
Pavel Kus committed
241 242
      endif

243
      call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString)
Pavel Kus's avatar
Pavel Kus committed
244 245 246
      return

      contains
247
        recursive subroutine merge_recursive_&
Andreas Marek's avatar
cleanup  
Andreas Marek committed
248
                  &PRECISION &
249
           (obj, np_off, nprocs, useGPU, wantDebug, success)
Pavel Kus's avatar
Pavel Kus committed
250
           use precision
251
           use elpa_abstract_impl
Pavel Kus's avatar
Pavel Kus committed
252 253 254 255 256
           implicit none

           ! noff is always a multiple of nblk_ev
           ! nlen-noff is always > nblk_ev

257
           class(elpa_abstract_impl_t), intent(inout) :: obj
Pavel Kus's avatar
Pavel Kus committed
258 259 260 261 262
           integer(kind=ik)     :: np_off, nprocs
           integer(kind=ik)     :: np1, np2, noff, nlen, nmid, n
#ifdef WITH_MPI
!           integer(kind=ik)     :: my_mpi_status(mpi_status_size)
#endif
263
           logical, intent(in)  :: useGPU, wantDebug
Pavel Kus's avatar
Pavel Kus committed
264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
           logical, intent(out) :: success

           success = .true.

           if (nprocs<=1) then
             ! Safety check only
             if (wantDebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs
             success = .false.
             return
           endif
           ! Split problem into 2 subproblems of size np1 / np2

           np1 = nprocs/2
           np2 = nprocs-np1

279
           if (np1 > 1) call merge_recursive_&
Andreas Marek's avatar
cleanup  
Andreas Marek committed
280
                        &PRECISION &
281
           (obj, np_off, np1, useGPU, wantDebug, success)
Pavel Kus's avatar
Pavel Kus committed
282
           if (.not.(success)) return
283
           if (np2 > 1) call merge_recursive_&
Andreas Marek's avatar
cleanup  
Andreas Marek committed
284
                        &PRECISION &
285
           (obj, np_off+np1, np2, useGPU, wantDebug, success)
Pavel Kus's avatar
Pavel Kus committed
286 287 288 289 290 291 292
           if (.not.(success)) return

           noff = limits(np_off)
           nmid = limits(np_off+np1) - noff
           nlen = limits(np_off+nprocs) - noff

#ifdef WITH_MPI
293
           call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
294 295
           if (my_pcol==np_off) then
             do n=np_off+np1,np_off+nprocs-1
296
               call mpi_send(d(noff+1), nmid, MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
Pavel Kus's avatar
Pavel Kus committed
297 298
             enddo
           endif
299
           call obj%timer%stop("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
300 301 302 303
#endif /* WITH_MPI */

           if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs) then
#ifdef WITH_MPI
304
             call obj%timer%start("mpi_communication")
305
             call mpi_recv(d(noff+1), nmid, MPI_REAL_PRECISION, np_off, 1, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
306
             call obj%timer%stop("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
307 308 309 310 311 312 313 314
#else /* WITH_MPI */
!             d(noff+1:noff+1+nmid-1) = d(noff+1:noff+1+nmid-1)
#endif /* WITH_MPI */
           endif

           if (my_pcol==np_off+np1) then
             do n=np_off,np_off+np1-1
#ifdef WITH_MPI
315
               call obj%timer%start("mpi_communication")
316
               call mpi_send(d(noff+nmid+1), nlen-nmid, MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
317
               call obj%timer%stop("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
318 319 320 321 322 323
#endif /* WITH_MPI */

             enddo
           endif
           if (my_pcol>=np_off .and. my_pcol<np_off+np1) then
#ifdef WITH_MPI
324
             call obj%timer%start("mpi_communication")
325
             call mpi_recv(d(noff+nmid+1), nlen-nmid, MPI_REAL_PRECISION, np_off+np1, 1,mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
326
             call obj%timer%stop("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
327 328 329 330 331 332 333 334
#else /* WITH_MPI */
!             d(noff+nmid+1:noff+nmid+1+nlen-nmid-1) = d(noff+nmid+1:noff+nmid+1+nlen-nmid-1)
#endif /* WITH_MPI */
           endif
           if (nprocs == np_cols) then

             ! Last merge, result distribution must be block cyclic, noff==0,
             ! p_col_bc is set so that only nev eigenvalues are calculated
335
             call merge_systems_&
Andreas Marek's avatar
cleanup  
Andreas Marek committed
336
                  &PRECISION &
337
                                 (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
Pavel Kus's avatar
Pavel Kus committed
338
                                 nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col, p_col, &
Andreas Marek's avatar
Andreas Marek committed
339
                                 l_col_bc, p_col_bc, np_off, nprocs, useGPU, wantDebug, success, max_threads )
Pavel Kus's avatar
Pavel Kus committed
340 341 342
             if (.not.(success)) return
           else
             ! Not last merge, leave dense column distribution
343
             call merge_systems_&
Andreas Marek's avatar
cleanup  
Andreas Marek committed
344
                  &PRECISION &
345
                                (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
Pavel Kus's avatar
Pavel Kus committed
346
                                 nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col(noff+1), p_col(noff+1), &
Andreas Marek's avatar
Andreas Marek committed
347
                                 l_col(noff+1), p_col(noff+1), np_off, nprocs, useGPU, wantDebug, success, max_threads )
Pavel Kus's avatar
Pavel Kus committed
348 349
             if (.not.(success)) return
           endif
350
       end subroutine merge_recursive_&
Andreas Marek's avatar
cleanup  
Andreas Marek committed
351
           &PRECISION
Pavel Kus's avatar
Pavel Kus committed
352

353
    end subroutine solve_tridi_&
354
        &PRECISION_AND_SUFFIX
Pavel Kus's avatar
Pavel Kus committed
355

356
    subroutine solve_tridi_col_&
357
    &PRECISION_AND_SUFFIX &
Andreas Marek's avatar
Andreas Marek committed
358
      ( obj, na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, useGPU, wantDebug, success, max_threads )
Pavel Kus's avatar
Pavel Kus committed
359 360 361 362 363

   ! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
   ! with the divide and conquer method.
   ! Works best if the number of processor rows is a power of 2!
      use precision
364
      use elpa_abstract_impl
Pavel Kus's avatar
Pavel Kus committed
365
      implicit none
366
      class(elpa_abstract_impl_t), intent(inout) :: obj
Pavel Kus's avatar
Pavel Kus committed
367 368

      integer(kind=ik)              :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows
369
      real(kind=REAL_DATATYPE)      :: d(na), e(na)
Pavel Kus's avatar
Pavel Kus committed
370
#ifdef USE_ASSUMED_SIZE
371
      real(kind=REAL_DATATYPE)      :: q(ldq,*)
Pavel Kus's avatar
Pavel Kus committed
372
#else
373
      real(kind=REAL_DATATYPE)      :: q(ldq,matrixCols)
Pavel Kus's avatar
Pavel Kus committed
374 375 376 377 378 379 380 381 382 383
#endif

      integer(kind=ik), parameter   :: min_submatrix_size = 16 ! Minimum size of the submatrices to be used

      real(kind=REAL_DATATYPE), allocatable    :: qmat1(:,:), qmat2(:,:)
      integer(kind=ik)              :: i, n, np
      integer(kind=ik)              :: ndiv, noff, nmid, nlen, max_size
      integer(kind=ik)              :: my_prow, np_rows, mpierr

      integer(kind=ik), allocatable :: limits(:), l_col(:), p_col_i(:), p_col_o(:)
384
      logical, intent(in)           :: useGPU, wantDebug
Pavel Kus's avatar
Pavel Kus committed
385 386 387 388
      logical, intent(out)          :: success
      integer(kind=ik)              :: istat
      character(200)                :: errorMessage

Andreas Marek's avatar
Andreas Marek committed
389 390
      integer(kind=ik), intent(in)  :: max_threads

391 392
      call obj%timer%start("solve_tridi_col" // PRECISION_SUFFIX)
      call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
393 394
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
395
      call obj%timer%stop("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415
      success = .true.
      ! Calculate the number of subdivisions needed.

      n = na
      ndiv = 1
      do while(2*ndiv<=np_rows .and. n>2*min_submatrix_size)
        n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries
        ndiv = ndiv*2
      enddo

      ! If there is only 1 processor row and not all eigenvectors are needed
      ! and the matrix size is big enough, then use 2 subdivisions
      ! so that merge_systems is called once and only the needed
      ! eigenvectors are calculated for the final problem.

      if (np_rows==1 .and. nev<na .and. na>2*min_submatrix_size) ndiv = 2

      allocate(limits(0:ndiv), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_tridi_col: error when allocating limits "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
416
        stop 1
Pavel Kus's avatar
Pavel Kus committed
417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452
      endif

      limits(0) = 0
      limits(ndiv) = na

      n = ndiv
      do while(n>1)
        n = n/2 ! n is always a power of 2
        do i=0,ndiv-1,2*n
          ! We want to have even boundaries (for cache line alignments)
          limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2
        enddo
      enddo

      ! Calculate the maximum size of a subproblem

      max_size = 0
      do i=1,ndiv
        max_size = MAX(max_size,limits(i)-limits(i-1))
      enddo

      ! Subdivide matrix by subtracting rank 1 modifications

      do i=1,ndiv-1
        n = limits(i)
        d(n) = d(n)-abs(e(n))
        d(n+1) = d(n+1)-abs(e(n))
      enddo

      if (np_rows==1)    then

        ! For 1 processor row there may be 1 or 2 subdivisions
        do n=0,ndiv-1
          noff = limits(n)        ! Start of subproblem
          nlen = limits(n+1)-noff ! Size of subproblem

453
          call solve_tridi_single_problem_&
454
          &PRECISION_AND_SUFFIX &
455
                                  (obj, nlen,d(noff+1),e(noff+1), &
Pavel Kus's avatar
Pavel Kus committed
456
                                    q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success)
457

Pavel Kus's avatar
Pavel Kus committed
458 459 460 461 462 463 464 465 466 467 468
          if (.not.(success)) return
        enddo

      else

        ! Solve sub problems in parallel with solve_tridi_single
        ! There is at maximum 1 subproblem per processor

        allocate(qmat1(max_size,max_size), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"solve_tridi_col: error when allocating qmat1 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
469
          stop 1
Pavel Kus's avatar
Pavel Kus committed
470 471 472 473 474
        endif

        allocate(qmat2(max_size,max_size), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"solve_tridi_col: error when allocating qmat2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
475
          stop 1
Pavel Kus's avatar
Pavel Kus committed
476 477 478 479 480 481 482 483
        endif

        qmat1 = 0 ! Make sure that all elements are defined

        if (my_prow < ndiv) then

          noff = limits(my_prow)        ! Start of subproblem
          nlen = limits(my_prow+1)-noff ! Size of subproblem
484
          call solve_tridi_single_problem_&
485
          &PRECISION_AND_SUFFIX &
486
                                    (obj, nlen,d(noff+1),e(noff+1),qmat1, &
Pavel Kus's avatar
Pavel Kus committed
487
                                    ubound(qmat1,dim=1), wantDebug, success)
488

Pavel Kus's avatar
Pavel Kus committed
489 490 491 492 493 494 495 496 497 498
          if (.not.(success)) return
        endif

        ! Fill eigenvectors in qmat1 into global matrix q

        do np = 0, ndiv-1

          noff = limits(np)
          nlen = limits(np+1)-noff
#ifdef WITH_MPI
499
          call obj%timer%start("mpi_communication")
500
          call MPI_Bcast(d(noff+1), nlen, MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
Pavel Kus's avatar
Pavel Kus committed
501
          qmat2 = qmat1
502
          call MPI_Bcast(qmat2, max_size*max_size, MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
503
          call obj%timer%stop("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
504 505 506 507 508 509
#else /* WITH_MPI */
!          qmat2 = qmat1 ! is this correct
#endif /* WITH_MPI */
          do i=1,nlen

#ifdef WITH_MPI
510
            call distribute_global_column_&
Andreas Marek's avatar
cleanup  
Andreas Marek committed
511
            &PRECISION &
512
                     (obj, qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
Pavel Kus's avatar
Pavel Kus committed
513
#else /* WITH_MPI */
514
            call distribute_global_column_&
Andreas Marek's avatar
cleanup  
Andreas Marek committed
515
            &PRECISION &
516
                     (obj, qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
Pavel Kus's avatar
Pavel Kus committed
517 518 519 520 521 522 523 524
#endif /* WITH_MPI */
          enddo

        enddo

        deallocate(qmat1, qmat2, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"solve_tridi_col: error when deallocating qmat2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
525
          stop 1
Pavel Kus's avatar
Pavel Kus committed
526 527 528 529 530 531 532 533 534
        endif

      endif

      ! Allocate and set index arrays l_col and p_col

      allocate(l_col(na), p_col_i(na),  p_col_o(na), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_tridi_col: error when allocating l_col "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
535
        stop 1
Pavel Kus's avatar
Pavel Kus committed
536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558
      endif

      do i=1,na
        l_col(i) = i
        p_col_i(i) = 0
        p_col_o(i) = 0
      enddo

      ! Merge subproblems

      n = 1
      do while(n<ndiv) ! if ndiv==1, the problem was solved by single call to solve_tridi_single

        do i=0,ndiv-1,2*n

          noff = limits(i)
          nmid = limits(i+n) - noff
          nlen = limits(i+2*n) - noff

          if (nlen == na) then
            ! Last merge, set p_col_o=-1 for unneeded (output) eigenvectors
            p_col_o(nev+1:na) = -1
          endif
559
          call merge_systems_&
Andreas Marek's avatar
cleanup  
Andreas Marek committed
560
          &PRECISION &
561
                              (obj, nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
Pavel Kus's avatar
Pavel Kus committed
562
                               matrixCols, mpi_comm_rows, mpi_comm_self, l_col(noff+1), p_col_i(noff+1), &
Andreas Marek's avatar
Andreas Marek committed
563
                               l_col(noff+1), p_col_o(noff+1), 0, 1, useGPU, wantDebug, success, max_threads)
Pavel Kus's avatar
Pavel Kus committed
564 565 566 567 568 569 570 571 572 573 574
          if (.not.(success)) return

        enddo

        n = 2*n

      enddo

      deallocate(limits, l_col, p_col_i, p_col_o, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_tridi_col: error when deallocating l_col "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
575
        stop 1
Pavel Kus's avatar
Pavel Kus committed
576 577
      endif

578
      call obj%timer%stop("solve_tridi_col" // PRECISION_SUFFIX)
Pavel Kus's avatar
Pavel Kus committed
579

580
    end subroutine solve_tridi_col_&
581
    &PRECISION_AND_SUFFIX
Pavel Kus's avatar
Pavel Kus committed
582

583
    recursive subroutine solve_tridi_single_problem_&
584
    &PRECISION_AND_SUFFIX &
585
    (obj, nlen, d, e, q, ldq, wantDebug, success)
Pavel Kus's avatar
Pavel Kus committed
586 587 588 589

   ! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
   ! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
     use precision
590
     use elpa_abstract_impl
Pavel Kus's avatar
Pavel Kus committed
591
     implicit none
592
     class(elpa_abstract_impl_t), intent(inout) :: obj
593
     integer(kind=ik)                         :: nlen, ldq
Pavel Kus's avatar
Pavel Kus committed
594 595 596 597 598
     real(kind=REAL_DATATYPE)                 :: d(nlen), e(nlen), q(ldq,nlen)

     real(kind=REAL_DATATYPE), allocatable    :: work(:), qtmp(:), ds(:), es(:)
     real(kind=REAL_DATATYPE)                 :: dtmp

Andreas Marek's avatar
Andreas Marek committed
599
     integer(kind=ik)              :: i, j, lwork, liwork, info
Pavel Kus's avatar
Pavel Kus committed
600 601 602 603 604 605 606
     integer(kind=ik), allocatable :: iwork(:)

     logical, intent(in)           :: wantDebug
     logical, intent(out)          :: success
      integer(kind=ik)             :: istat
      character(200)               :: errorMessage

607
     call obj%timer%start("solve_tridi_single" // PRECISION_SUFFIX)
Pavel Kus's avatar
Pavel Kus committed
608 609 610 611 612

     success = .true.
     allocate(ds(nlen), es(nlen), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_tridi_single: error when allocating ds "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
613
       stop 1
Pavel Kus's avatar
Pavel Kus committed
614 615 616 617 618 619 620 621 622 623 624 625 626 627
     endif

     ! Save d and e for the case that dstedc fails

     ds(:) = d(:)
     es(:) = e(:)

     ! First try dstedc, this is normally faster but it may fail sometimes (why???)

     lwork = 1 + 4*nlen + nlen**2
     liwork =  3 + 5*nlen
     allocate(work(lwork), iwork(liwork), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_tridi_single: error when allocating work "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
628
       stop 1
Pavel Kus's avatar
Pavel Kus committed
629
     endif
630
     call obj%timer%start("blas")
631
     call PRECISION_STEDC('I', nlen, d, e, q, ldq, work, lwork, iwork, liwork, info)
632
     call obj%timer%stop("blas")
Pavel Kus's avatar
Pavel Kus committed
633 634 635 636 637 638 639 640 641

     if (info /= 0) then

       ! DSTEDC failed, try DSTEQR. The workspace is enough for DSTEQR.

       write(error_unit,'(a,i8,a)') 'Warning: Lapack routine DSTEDC failed, info= ',info,', Trying DSTEQR!'

       d(:) = ds(:)
       e(:) = es(:)
642
       call obj%timer%start("blas")
643
       call PRECISION_STEQR('I', nlen, d, e, q, ldq, work, info)
644
       call obj%timer%stop("blas")
645

Pavel Kus's avatar
Pavel Kus committed
646 647 648 649 650 651 652 653 654 655 656 657 658
       ! If DSTEQR fails also, we don't know what to do further ...

       if (info /= 0) then
         if (wantDebug) &
           write(error_unit,'(a,i8,a)') 'ELPA1_solve_tridi_single: ERROR: Lapack routine DSTEQR failed, info= ',info,', Aborting!'
           success = .false.
           return
         endif
       end if

       deallocate(work,iwork,ds,es, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"solve_tridi_single: error when deallocating ds "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
659
         stop 1
Pavel Kus's avatar
Pavel Kus committed
660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681
       endif

      ! Check if eigenvalues are monotonically increasing
      ! This seems to be not always the case  (in the IBM implementation of dstedc ???)

      do i=1,nlen-1
        if (d(i+1)<d(i)) then
#ifdef DOUBLE_PRECISION_REAL
          if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk8) then
#else
          if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4) then
#endif
            write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1)
          else
            write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1)
            write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.'
            write(error_unit,'(a)') 'In this extent, this is completely harmless.'
            write(error_unit,'(a)') 'Still, we keep this info message just in case.'
          end if
          allocate(qtmp(nlen), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"solve_tridi_single: error when allocating qtmp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
682
            stop 1
Pavel Kus's avatar
Pavel Kus committed
683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699
          endif

          dtmp = d(i+1)
          qtmp(1:nlen) = q(1:nlen,i+1)
          do j=i,1,-1
            if (dtmp<d(j)) then
              d(j+1)        = d(j)
              q(1:nlen,j+1) = q(1:nlen,j)
            else
              exit ! Loop
            endif
          enddo
          d(j+1)        = dtmp
          q(1:nlen,j+1) = qtmp(1:nlen)
          deallocate(qtmp, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"solve_tridi_single: error when deallocating qtmp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
700
            stop 1
Pavel Kus's avatar
Pavel Kus committed
701 702 703 704
          endif

       endif
     enddo
705
     call obj%timer%stop("solve_tridi_single" // PRECISION_SUFFIX)
Pavel Kus's avatar
Pavel Kus committed
706

707
    end subroutine solve_tridi_single_problem_&
708
    &PRECISION_AND_SUFFIX
Pavel Kus's avatar
Pavel Kus committed
709