!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"

module ELPA2

! Version 1.1.2, 2011-02-21

  use elpa_utilities
  USE ELPA1
  use elpa2_utilities
  use elpa_pdgeqrf

  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

  public :: bandred_real
  public :: tridiag_band_real
  public :: trans_ev_tridi_to_band_real
  public :: trans_ev_band_to_full_real

  public :: bandred_complex
  public :: tridiag_band_complex
  public :: trans_ev_tridi_to_band_complex
  public :: trans_ev_band_to_full_complex

  public :: band_band_real
  public :: divide_band

  integer, public :: which_qr_decomposition = 1     ! defines which QR-decomposition algorithm will be used
                                                    ! 0 for unblocked
                                                    ! 1 for blocked (maxrank: nblk)
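
  ! Illustrative note (a sketch, not part of the original source): since
  ! which_qr_decomposition is a public module variable, a caller could select the
  ! unblocked variant before the solve, e.g.
  !
  !   use ELPA2
  !   which_qr_decomposition = 0   ! hypothetical caller code; the default is 1 (blocked)
  !
  ! and then call solve_evp_real_2stage with useQR enabled.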
!-------------------------------------------------------------------------------

  ! The following array contains the Householder vectors of the
  ! transformation band -> tridiagonal.
  ! It is allocated and set in tridiag_band_real and used in
  ! trans_ev_tridi_to_band_real.
  ! It must be deallocated by the user after trans_ev_tridi_to_band_real!

  real*8, allocatable :: hh_trans_real(:,:)
  complex*16, allocatable :: hh_trans_complex(:,:)
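
  ! Illustrative call order (a sketch, not part of the original source) for callers
  ! driving the stage routines directly instead of solve_evp_real_2stage; the
  ! argument names follow the calls made inside that driver routine:
  !
  !   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols,          &
  !                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)          ! fills hh_trans_real
  !   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols,  &
  !                                    mpi_comm_rows, mpi_comm_cols, wantDebug, &
  !                                    success, THIS_REAL_ELPA_KERNEL)            ! consumes hh_trans_real
  !   deallocate(hh_trans_real)                                                   ! the caller frees it afterwards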

!-------------------------------------------------------------------------------

  include 'mpif.h'


!******
contains

function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_real_2stage: Solves the real eigenvalue problem with a 2-stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a and q
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,matrixCols)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
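!
!  Illustrative call (a sketch, not part of the original source; the arrays,
!  communicators and the error handling are placeholders supplied by the caller;
!  the optional kernel and QR arguments are omitted here):
!
!    success = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
!                                    matrixCols, mpi_comm_rows, mpi_comm_cols, &
!                                    mpi_comm_all)
!    if (.not. success) write(error_unit,*) 'solve_evp_real_2stage failed'
!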
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   logical, intent(in), optional :: useQR
   logical                       :: useQRActual, useQREnvironment
   integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer                       :: THIS_REAL_ELPA_KERNEL

   integer, intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                    mpi_comm_cols, mpi_comm_all
   integer, intent(in)           :: nblk
   real*8, intent(inout)         :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)

   integer                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer                       :: nbw, num_blocks
   real*8, allocatable           :: tmat(:,:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
   logical                       :: success
   logical, save                 :: firstCall = .true.
   logical                       :: wantDebug

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)


   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif

   success = .true.

   useQRActual = .false.

   ! set usage of qr decomposition via API call
   if (present(useQR)) then
     if (useQR) useQRActual = .true.
     if (.not.(useQR)) useQRActual = .false.
   endif

   ! overwrite this with environment variable settings
   if (qr_decomposition_via_environment_variable(useQREnvironment)) then
     useQRActual = useQREnvironment
   endif

   if (useQRActual) then
     if (mod(na,nblk) .ne. 0) then
       if (wantDebug) then
         write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
       endif
     print *, "Do not use QR-decomposition for this matrix and blocksize."
     success = .false.
     return
     endif
   endif


   if (present(THIS_REAL_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
   else

     ! if kernel is not chosen via API
     ! check whether set by environment variable
     THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
   endif

   ! check whether chosen kernel is allowed
   if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then

     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The chosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
         endif
       enddo

       write(error_unit,*) " "
       write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used!"
     endif
     THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC

   endif

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                     tmat, wantDebug, success, useQRActual)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, mpi_comm_rows, &
                          mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                    mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
     write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, mpi_comm_rows, &
                                    mpi_comm_cols, wantDebug, success, THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                   mpi_comm_cols, useQRActual)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif
1  format(a,f10.3)

end function solve_evp_real_2stage

!-------------------------------------------------------------------------------

!-------------------------------------------------------------------------------

function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2-stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a and q
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,matrixCols)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
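!
!  Illustrative call (a sketch, not part of the original source; the arrays and
!  communicators are placeholders supplied by the caller; the optional kernel
!  argument is omitted here):
!
!    success = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
!                                       matrixCols, mpi_comm_rows, mpi_comm_cols, &
!                                       mpi_comm_all)
!    if (.not. success) write(error_unit,*) 'solve_evp_complex_2stage failed'
!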
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer                       :: THIS_COMPLEX_ELPA_KERNEL
   integer, intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex*16, intent(inout)     :: a(lda,matrixCols), q(ldq,matrixCols)
   real*8, intent(inout)         :: ev(na)

   integer                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex*16, allocatable       :: tmat(:,:,:)
   real*8, allocatable           :: q_real(:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i

   logical                       :: success, wantDebug
   logical, save                 :: firstCall = .true.

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif


   success = .true.

   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
   else
     ! if kernel is not chosen via API
     ! check whether set by environment variable
     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
   endif

   ! check whether chosen kernel is allowed
   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then

     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The chosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
         endif
       enddo

       write(error_unit,*) " "
       write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used!"
     endif
     THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
   endif
   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop()
#endif
     return
   endif
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

   allocate(q_real(l_rows,l_cols))

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
                    mpi_comm_rows, mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
      write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,  &
                                       matrixCols, mpi_comm_rows, mpi_comm_cols,&
                                       wantDebug, success,THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, &
                                      mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif

1  format(a,f10.3)

end function solve_evp_complex_2stage

!-------------------------------------------------------------------------------

subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success, useQR)

!-------------------------------------------------------------------------------
!  bandred_real: Reduces a distributed symmetric matrix to band form
!
!  Parameters
!
!  na          Order of matrix
!
!  a(lda,matrixCols)    Distributed matrix which should be reduced.
!              Distribution is like in Scalapack.
!              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
!              a(:,:) is overwritten on exit with the band and the Householder vectors
!              in the upper half.
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi bandwidth of output matrix
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
!              Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
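!
!  Illustrative setup (a sketch, not part of the original source), mirroring what
!  solve_evp_real_2stage does before calling this routine:
!
!    nbw       = (31/nblk+1)*nblk        ! bandwidth: a multiple of nblk, >= 32
!    numBlocks = (na-1)/nbw + 1
!    allocate(tmat(nbw,nbw,numBlocks))
!    call bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, &
!                      mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, success, useQR)
!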
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none

   integer             :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
   real*8              :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)

   integer             :: my_prow, my_pcol, np_rows, np_cols, mpierr
   integer             :: l_cols, l_rows
   integer             :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
   integer             :: istep, ncol, lch, lcx, nlc
   integer             :: tile_size, l_rows_tile, l_cols_tile

   real*8              :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)

   real*8, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)

   ! needed for blocked QR decomposition
   integer             :: PQRPARAM(11), work_size
   real*8              :: dwork_size(1)
   real*8, allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)

   logical, intent(in) :: wantDebug
   logical, intent(out):: success

   logical, intent(in) :: useQR

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("bandred_real")
#endif
   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
   success = .true.


   ! Semibandwidth nbw must be a multiple of blocksize nblk
   if (mod(nbw,nblk)/=0) then
     if (my_prow==0 .and. my_pcol==0) then
       if (wantDebug) then
         write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk
         write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk'
       endif
       success = .false.
       return
     endif
   endif

   ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

   tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
   tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

   l_rows_tile = tile_size/np_rows ! local rows of a tile
   l_cols_tile = tile_size/np_cols ! local cols of a tile

   if (useQR) then
     if (which_qr_decomposition == 1) then
       call qr_pqrparam_init(pqrparam,    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
       allocate(tauvector(na))
       allocate(blockheuristic(nblk))
       l_rows = local_index(na, my_prow, np_rows, nblk, -1)
       allocate(vmr(max(l_rows,1),na))

       call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), tmat(1,1,1), nbw, dwork_size(1), -1, na, &
                             nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
       work_size = dwork_size(1)
       allocate(work_blocked(work_size))

       work_blocked = 0.0d0
       deallocate(vmr)
     endif
   endif

   do istep = (na-1)/nbw, 1, -1

     n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

     ! Number of local columns/rows of remaining matrix
     l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
     l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

     ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

     allocate(vmr(max(l_rows,1),2*n_cols))
     allocate(umc(max(l_cols,1),2*n_cols))

     allocate(vr(l_rows+1))

     vmr(1:l_rows,1:n_cols) = 0.
     vr(:) = 0
     tmat(:,:,istep) = 0

     ! Reduce current block to lower triangular form

     if (useQR) then
       if (which_qr_decomposition == 1) then
         call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), &
                                  tmat(1,1,istep), nbw, work_blocked,       &
                                  work_size, na, n_cols, nblk, nblk,        &
                                  istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                  0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
                                  blockheuristic)
       endif
     else

       do lc = n_cols, 1, -1

         ncol = istep*nbw + lc ! absolute column number of householder vector
         nrow = ncol - nbw ! Absolute number of pivot row

         lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
         lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

         tau = 0

         if (nrow == 1) exit ! Nothing to do

         cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

         if (my_pcol==cur_pcol) then

           ! Get vector to be transformed; distribute last element and norm of
           ! remaining elements to all procs in current column

           vr(1:lr) = a(1:lr,lch) ! vector to be transformed

           if (my_prow==prow(nrow, nblk, np_rows)) then
             aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
             aux1(2) = vr(lr)
           else
             aux1(1) = dot_product(vr(1:lr),vr(1:lr))
             aux1(2) = 0.
           endif

           call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

           vnorm2 = aux2(1)
           vrl    = aux2(2)

           ! Householder transformation

           call hh_transform_real(vrl, vnorm2, xf, tau)

           ! Scale vr and store Householder vector for back transformation

           vr(1:lr) = vr(1:lr) * xf
           if (my_prow==prow(nrow, nblk, np_rows)) then
             a(1:lr-1,lch) = vr(1:lr-1)
             a(lr,lch) = vrl
             vr(lr) = 1.
           else
             a(1:lr,lch) = vr(1:lr)
           endif

         endif

         ! Broadcast Householder vector and tau along columns

         vr(lr+1) = tau
         call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
         vmr(1:lr,lc) = vr(1:lr)
         tau = vr(lr+1)
         tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat

         ! Transform remaining columns in current block with Householder vector
         ! Local dot product

         aux1 = 0

         nlc = 0 ! number of local columns
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
           endif
         enddo

         ! Get global dot products
         if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

         ! Transform

         nlc = 0
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
           endif
         enddo

       enddo

       ! Calculate scalar products of stored Householder vectors.
       ! This can be done in different ways, we use dsyrk

       vav = 0
       if (l_rows>0) &
           call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,dim=1),0.d0,vav,ubound(vav,dim=1))
       call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows)

       ! Calculate triangular matrix T for block Householder Transformation

       do lc=n_cols,1,-1
         tau = tmat(lc,lc,istep)
         if (lc<n_cols) then
           call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,dim=1),vav(lc+1,lc),1)
           tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
         endif
       enddo
     endif

    ! Transpose vmr -> vmc (stored in umc, second half)

    call elpa_transpose_vectors_real  (vmr, ubound(vmr,dim=1), mpi_comm_rows, &
                                    umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, &
                                    1, istep*nbw, n_cols, nblk)

    ! Calculate umc = A**T * vmr
    ! Note that the distributed A has to be transposed
    ! Opposed to direct tridiagonalization there is no need to use the cache locality
    ! of the tiles, so we can use strips of the matrix

    umc(1:l_cols,1:n_cols) = 0.d0
    vmr(1:l_rows,n_cols+1:2*n_cols) = 0
    if (l_cols>0 .and. l_rows>0) then
      do i=0,(istep*nbw-1)/tile_size

        lcs = i*l_cols_tile+1
        lce = min(l_cols,(i+1)*l_cols_tile)
        if (lce<lcs) cycle

        lre = min(l_rows,(i+1)*l_rows_tile)
        call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,dim=1), &
                     vmr,ubound(vmr,dim=1),1.d0,umc(lcs,1),ubound(umc,dim=1))

        if (i==0) cycle
        lre = min(l_rows,i*l_rows_tile)
        call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
                     umc(lcs,n_cols+1),ubound(umc,dim=1),1.d0,vmr(1,n_cols+1),ubound(vmr,dim=1))
      enddo
    endif

    ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
    ! on the processors containing the diagonal
    ! This is only necessary if ur has been calculated, i.e. if the
    ! global tile size is smaller than the global remaining matrix

    if (tile_size < istep*nbw) then
       call elpa_reduce_add_vectors_real  (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, &
                                      umc, ubound(umc,dim=1), mpi_comm_cols, &
                                      istep*nbw, n_cols, nblk)
    endif

    if (l_cols>0) then
      allocate(tmp(l_cols,n_cols))
      call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
      umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
      deallocate(tmp)
    endif

    ! U = U * Tmat**T

    call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,dim=1),umc,ubound(umc,dim=1))

    ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T

    call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umc,ubound(umc,dim=1),umc(1,n_cols+1), &
               ubound(umc,dim=1),0.d0,vav,ubound(vav,dim=1))
    call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep),    &
               ubound(tmat,dim=1),vav,ubound(vav,dim=1))

    call symm_matrix_allreduce(n_cols,vav, nbw, nbw ,mpi_comm_cols)

    ! U = U - 0.5 * V * VAV
    call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umc(1,n_cols+1),ubound(umc,dim=1),vav, &
                ubound(vav,dim=1),1.d0,umc,ubound(umc,dim=1))

    ! Transpose umc -> umr (stored in vmr, second half)

    call elpa_transpose_vectors_real  (umc, ubound(umc,dim=1), mpi_comm_cols, &
                                   vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
                                   1, istep*nbw, n_cols, nblk)

    ! A = A - V*U**T - U*V**T

    do i=0,(istep*nbw-1)/tile_size
      lcs = i*l_cols_tile+1
      lce = min(l_cols,(i+1)*l_cols_tile)
      lre = min(l_rows,(i+1)*l_rows_tile)
      if (lce<lcs .or. lre<1) cycle
      call dgemm('N','T',lre,lce-lcs+1,2*n_cols,-1.d0, &
                  vmr,ubound(vmr,dim=1),umc(lcs,1),ubound(umc,dim=1), &
                  1.d0,a(1,lcs),lda)
    enddo

    deallocate(vmr, umc, vr)

  enddo

  if (useQR) then
    if (which_qr_decomposition == 1) then
      deallocate(work_blocked)
      deallocate(tauvector)
    endif
  endif

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("bandred_real")
#endif
end subroutine bandred_real

!-------------------------------------------------------------------------------

subroutine symm_matrix_allreduce(n,a,lda,ldb,comm)

!-------------------------------------------------------------------------------
!  symm_matrix_allreduce: Does an mpi_allreduce for a symmetric matrix A.
!  On entry, only the upper half of A needs to be set
!  On exit, the complete matrix is set
!-------------------------------------------------------------------------------
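!
!  Illustrative use (a sketch, not part of the original source), as called from
!  bandred_real on the nbw x nbw matrix vav:
!
!    call symm_matrix_allreduce(n_cols, vav, nbw, nbw, mpi_comm_rows)
!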
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   integer  :: n, lda, ldb, comm
   real*8   :: a(lda,ldb)

   integer  :: i, nc, mpierr
   real*8   :: h1(n*n), h2(n*n)

#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("symm_matrix_allreduce")
#endif

   nc = 0
   do i=1,n
     h1(nc+1:nc+i) = a(1:i,i)
     nc = nc+i
   enddo

   call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,comm,mpierr)

   nc = 0
   do i=1,n
     a(1:i,i) = h2(nc+1:nc+i)
     a(i,1:i-1) = a(1:i-1,i)
     nc = nc+i