!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"

module ELPA2

! Version 1.1.2, 2011-02-21

  use elpa_utilities
  USE ELPA1
  use elpa2_utilities

  use elpa_pdgeqrf

  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

  public :: bandred_real
  public :: tridiag_band_real
  public :: trans_ev_tridi_to_band_real
  public :: trans_ev_band_to_full_real

  public :: bandred_complex
  public :: tridiag_band_complex
  public :: trans_ev_tridi_to_band_complex
  public :: trans_ev_band_to_full_complex

  public :: band_band_real
  public :: divide_band

  integer, public :: which_qr_decomposition = 1     ! defines which QR-decomposition algorithm will be used
                                                    ! 0 for unblocked
                                                    ! 1 for blocked (maxrank: nblk)
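                                                    ! (note: bandred_real below only takes its
                                                    !  QR path when this is set to 1)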
!-------------------------------------------------------------------------------

  ! The following array contains the Householder vectors of the
  ! transformation band -> tridiagonal.
  ! It is allocated and set in tridiag_band_real and used in
  ! trans_ev_tridi_to_band_real.
  ! It must be deallocated by the user after trans_ev_tridi_to_band_real!

  real*8, allocatable :: hh_trans_real(:,:)
  complex*16, allocatable :: hh_trans_complex(:,:)

!-------------------------------------------------------------------------------

  include 'mpif.h'


!******
contains

function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                               mpi_comm_rows, mpi_comm_cols,             &
                               mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,  &
                               useQR) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_real_2stage: Solves the real eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in Scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a and q
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,matrixCols)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must always be dimensioned to the full size (corresponding to (na,na)),
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
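!
!  Minimal calling sketch (illustrative only; assumes the row/column/global MPI
!  communicators already exist, e.g. from mpi_comm_split, that a, ev and q are
!  allocated as described above, and omits the optional kernel/QR arguments):
!
!    success = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                    matrixCols, mpi_comm_rows,         &
!                                    mpi_comm_cols, mpi_comm_all)
!
!-------------------------------------------------------------------------------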
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none

   logical, intent(in), optional :: useQR
   logical                       :: useQRActual, useQREnvironment
   integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer                       :: THIS_REAL_ELPA_KERNEL

   integer, intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                    mpi_comm_cols, mpi_comm_all
   integer, intent(in)           :: nblk
   real*8, intent(inout)         :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)

   integer                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer                       :: nbw, num_blocks
   real*8, allocatable           :: tmat(:,:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
   logical                       :: success
   logical, save                 :: firstCall = .true.
   logical                       :: wantDebug

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif

   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif

   success = .true.

   useQRActual = .false.

   ! set usage of qr decomposition via API call
   if (present(useQR)) then
     if (useQR) useQRActual = .true.
     if (.not.(useQR)) useQRActual = .false.
   endif

   ! overwrite this with environment variable settings
   if (qr_decomposition_via_environment_variable(useQREnvironment)) then
     useQRActual = useQREnvironment
   endif

   if (useQRActual) then
     if (mod(na,nblk) .ne. 0) then
       if (wantDebug) then
         write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
       endif
       print *, "Do not use QR-decomposition for this matrix and blocksize."
       success = .false.
       return
     endif
   endif

   if (present(THIS_REAL_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
   else
     ! if the kernel is not chosen via the API,
     ! check whether it is set by an environment variable
     THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
   endif

   ! check whether the chosen kernel is allowed
   if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then

     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
         endif
       enddo
Andreas Marek's avatar
Andreas Marek committed
248

249
250
251
252
       write(error_unit,*) " "
       write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !"
     endif
     THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
   endif

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1
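   ! Worked example (integer division): nblk = 16 gives nbw = (31/16+1)*16 = 32,
   ! nblk = 64 gives nbw = 64; with na = 1000 and nbw = 32, num_blocks = 32.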

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                     tmat, wantDebug, success, useQRActual)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, mpi_comm_rows, &
                          mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                    mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0

   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, mpi_comm_rows, &
                                    mpi_comm_cols, wantDebug, success, THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                   mpi_comm_cols, useQRActual)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0

   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif

1  format(a,f10.3)

end function solve_evp_real_2stage

!-------------------------------------------------------------------------------

!-------------------------------------------------------------------------------

function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                  mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in Scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a and q
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,matrixCols)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must always be dimensioned to the full size (corresponding to (na,na)),
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
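!
!  Note on types (illustrative declarations only): even for the complex solver
!  the eigenvalues are real, i.e.
!
!    complex*16 :: a(lda,matrixCols), q(ldq,matrixCols)
!    real*8     :: ev(na)
!    success = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                       matrixCols, mpi_comm_rows,         &
!                                       mpi_comm_cols, mpi_comm_all)
!
!-------------------------------------------------------------------------------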
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none

   integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer                       :: THIS_COMPLEX_ELPA_KERNEL
   integer, intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex*16, intent(inout)     :: a(lda,matrixCols), q(ldq,matrixCols)
   real*8, intent(inout)         :: ev(na)

   integer                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex*16, allocatable       :: tmat(:,:,:)
   real*8, allocatable           :: q_real(:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i

   logical                       :: success, wantDebug
   logical, save                 :: firstCall = .true.

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif

   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif


   success = .true.

   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
   else
     ! if the kernel is not chosen via the API,
     ! check whether it is set by an environment variable
     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
   endif

   ! check whether the chosen kernel is allowed
   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then

     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
         endif
       enddo
Andreas Marek's avatar
Andreas Marek committed
445

446
447
448
449
       write(error_unit,*) " "
       write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
     endif
     THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
   endif
   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop()
#endif
     return
   endif
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

   allocate(q_real(l_rows,l_cols))

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
                    mpi_comm_rows, mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
      write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0

   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,  &
                                       matrixCols, mpi_comm_rows, mpi_comm_cols,&
                                       wantDebug, success, THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, &
                                      mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0

   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif

1  format(a,f10.3)

end function solve_evp_complex_2stage

!-------------------------------------------------------------------------------

subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success, useQR)

!-------------------------------------------------------------------------------
!  bandred_real: Reduces a distributed symmetric matrix to band form
!
!  Parameters
!
!  na          Order of matrix
!
!  a(lda,matrixCols)    Distributed matrix which should be reduced.
!              Distribution is like in Scalapack.
!              As opposed to Scalapack, a(:,:) must be set completely (upper and lower half).
!              a(:,:) is overwritten on exit with the band and the Householder vectors
!              in the upper half.
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi bandwidth of output matrix
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
!              Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
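!
!  Allocation sketch on the caller side (illustrative only; this mirrors how
!  solve_evp_real_2stage above sets up the call):
!
!    nbw        = (31/nblk+1)*nblk
!    num_blocks = (na-1)/nbw + 1
!    allocate(tmat(nbw,nbw,num_blocks))
!    call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, &
!                      mpi_comm_rows, mpi_comm_cols, tmat, .false., success, .false.)
!
!-------------------------------------------------------------------------------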
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none

   integer             :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
   real*8              :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)

   integer             :: my_prow, my_pcol, np_rows, np_cols, mpierr
   integer             :: l_cols, l_rows
   integer             :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
   integer             :: istep, ncol, lch, lcx, nlc
   integer             :: tile_size, l_rows_tile, l_cols_tile

   real*8              :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)

   real*8, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)

   ! needed for blocked QR decomposition
   integer             :: PQRPARAM(11), work_size
   real*8              :: dwork_size(1)
   real*8, allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)

   logical, intent(in) :: wantDebug
   logical, intent(out):: success

   logical, intent(in) :: useQR

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("bandred_real")
#endif

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
   success = .true.

   ! Semibandwidth nbw must be a multiple of blocksize nblk

   if (mod(nbw,nblk)/=0) then
     if (my_prow==0 .and. my_pcol==0) then
       if (wantDebug) then
         write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk
         write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk'
       endif
       success = .false.
       return
     endif
   endif

   ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

   tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
   tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

   l_rows_tile = tile_size/np_rows ! local rows of a tile
   l_cols_tile = tile_size/np_cols ! local cols of a tile
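
   ! Worked example (illustrative): with nblk = 16 and np_rows = np_cols = 4,
   ! least_common_multiple(4,4) = 4 gives tile_size = 64, which is then rounded
   ! up to ((128*4-1)/64+1)*64 = 512, so l_rows_tile = l_cols_tile = 512/4 = 128.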

   if (useQR) then
     if (which_qr_decomposition == 1) then
       call qr_pqrparam_init(pqrparam,    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
       allocate(tauvector(na))
       allocate(blockheuristic(nblk))
       l_rows = local_index(na, my_prow, np_rows, nblk, -1)
       allocate(vmr(max(l_rows,1),na))

       call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), tmat(1,1,1), nbw, dwork_size(1), -1, na, &
                             nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
       work_size = dwork_size(1)
       allocate(work_blocked(work_size))

       work_blocked = 0.0d0
       deallocate(vmr)
     endif
   endif

   do istep = (na-1)/nbw, 1, -1

     n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

     ! Number of local columns/rows of remaining matrix
     l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
     l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

     ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

     allocate(vmr(max(l_rows,1),2*n_cols))
     allocate(umc(max(l_cols,1),2*n_cols))

     allocate(vr(l_rows+1))

     vmr(1:l_rows,1:n_cols) = 0.
     vr(:) = 0
     tmat(:,:,istep) = 0

     ! Reduce current block to lower triangular form

     if (useQR) then
       if (which_qr_decomposition == 1) then
         call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), &
                                  tmat(1,1,istep), nbw, work_blocked,       &
                                  work_size, na, n_cols, nblk, nblk,        &
                                  istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                  0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
                                  blockheuristic)
       endif
     else

       do lc = n_cols, 1, -1

         ncol = istep*nbw + lc ! absolute column number of householder vector
         nrow = ncol - nbw ! Absolute number of pivot row

         lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
         lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

         tau = 0

         if (nrow == 1) exit ! Nothing to do

         cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

         if (my_pcol==cur_pcol) then

           ! Get vector to be transformed; distribute last element and norm of
           ! remaining elements to all procs in current column

           vr(1:lr) = a(1:lr,lch) ! vector to be transformed

           if (my_prow==prow(nrow, nblk, np_rows)) then
             aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
             aux1(2) = vr(lr)
           else
             aux1(1) = dot_product(vr(1:lr),vr(1:lr))
             aux1(2) = 0.
           endif

           call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

           vnorm2 = aux2(1)
           vrl    = aux2(2)

           ! Householder transformation

           call hh_transform_real(vrl, vnorm2, xf, tau)

           ! Scale vr and store Householder vector for back transformation

           vr(1:lr) = vr(1:lr) * xf
           if (my_prow==prow(nrow, nblk, np_rows)) then
             a(1:lr-1,lch) = vr(1:lr-1)
             a(lr,lch) = vrl
             vr(lr) = 1.
           else
             a(1:lr,lch) = vr(1:lr)
           endif

         endif

         ! Broadcast Householder vector and tau along columns

         vr(lr+1) = tau
         call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
         vmr(1:lr,lc) = vr(1:lr)
         tau = vr(lr+1)
         tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat

         ! Transform remaining columns in current block with Householder vector
         ! Local dot product

         aux1 = 0

         nlc = 0 ! number of local columns
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
           endif
         enddo

         ! Get global dot products
         if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

         ! Transform

         nlc = 0
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
           endif
         enddo

       enddo

       ! Calculate scalar products of stored Householder vectors.
       ! This can be done in different ways; we use dsyrk

       vav = 0
       if (l_rows>0) &
           call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,dim=1),0.d0,vav,ubound(vav,dim=1))
       call symm_matrix_allreduce(n_cols,vav, nbw, nbw, mpi_comm_rows)

       ! Calculate triangular matrix T for block Householder Transformation

       do lc=n_cols,1,-1
         tau = tmat(lc,lc,istep)
         if (lc<n_cols) then
           call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,dim=1),vav(lc+1,lc),1)
           tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
         endif
       enddo
     endif

    ! Transpose vmr -> vmc (stored in umc, second half)

    call elpa_transpose_vectors_real  (vmr, ubound(vmr,dim=1), mpi_comm_rows, &
                                    umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, &
                                    1, istep*nbw, n_cols, nblk)

    ! Calculate umc = A**T * vmr
    ! Note that the distributed A has to be transposed
    ! As opposed to direct tridiagonalization there is no need to use the cache locality
    ! of the tiles, so we can use strips of the matrix

    umc(1:l_cols,1:n_cols) = 0.d0
    vmr(1:l_rows,n_cols+1:2*n_cols) = 0
    if (l_cols>0 .and. l_rows>0) then
      do i=0,(istep*nbw-1)/tile_size

        lcs = i*l_cols_tile+1
        lce = min(l_cols,(i+1)*l_cols_tile)
        if (lce<lcs) cycle

        lre = min(l_rows,(i+1)*l_rows_tile)
        call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,dim=1), &
                     vmr,ubound(vmr,dim=1),1.d0,umc(lcs,1),ubound(umc,dim=1))

        if (i==0) cycle
        lre = min(l_rows,i*l_rows_tile)
        call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
                     umc(lcs,n_cols+1),ubound(umc,dim=1),1.d0,vmr(1,n_cols+1),ubound(vmr,dim=1))
      enddo
    endif

    ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
    ! on the processors containing the diagonal
    ! This is only necessary if ur has been calculated, i.e. if the
    ! global tile size is smaller than the global remaining matrix

    if (tile_size < istep*nbw) then
       call elpa_reduce_add_vectors_real  (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, &
                                      umc, ubound(umc,dim=1), mpi_comm_cols, &
                                      istep*nbw, n_cols, nblk)
    endif

    if (l_cols>0) then
      allocate(tmp(l_cols,n_cols))
      call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
      umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
      deallocate(tmp)
    endif

    ! U = U * Tmat**T

    call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,dim=1),umc,ubound(umc,dim=1))

    ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T

    call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umc,ubound(umc,dim=1),umc(1,n_cols+1), &
               ubound(umc,dim=1),0.d0,vav,ubound(vav,dim=1))
    call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep),    &
               ubound(tmat,dim=1),vav,ubound(vav,dim=1))

    call symm_matrix_allreduce(n_cols,vav, nbw, nbw, mpi_comm_cols)

    ! U = U - 0.5 * V * VAV
    call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umc(1,n_cols+1),ubound(umc,dim=1),vav, &
                ubound(vav,dim=1),1.d0,umc,ubound(umc,dim=1))

    ! Transpose umc -> umr (stored in vmr, second half)

    call elpa_transpose_vectors_real  (umc, ubound(umc,dim=1), mpi_comm_cols, &
                                   vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
                                   1, istep*nbw, n_cols, nblk)

    ! A = A - V*U**T - U*V**T

    do i=0,(istep*nbw-1)/tile_size
      lcs = i*l_cols_tile+1
      lce = min(l_cols,(i+1)*l_cols_tile)
      lre = min(l_rows,(i+1)*l_rows_tile)
      if (lce<lcs .or. lre<1) cycle
      call dgemm('N','T',lre,lce-lcs+1,2*n_cols,-1.d0, &
                  vmr,ubound(vmr,dim=1),umc(lcs,1),ubound(umc,dim=1), &
                  1.d0,a(1,lcs),lda)
    enddo

    deallocate(vmr, umc, vr)

  enddo

  if (useQR) then
    if (which_qr_decomposition == 1) then
      deallocate(work_blocked)
      deallocate(tauvector)
    endif
  endif

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("bandred_real")
#endif

end subroutine bandred_real

!-------------------------------------------------------------------------------

subroutine symm_matrix_allreduce(n,a,lda,ldb,comm)

!-------------------------------------------------------------------------------
!  symm_matrix_allreduce: Does an mpi_allreduce for a symmetric matrix A.
!  On entry, only the upper half of A needs to be set
!  On exit, the complete matrix is set
!-------------------------------------------------------------------------------
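!
!  Packing sketch (illustrative): the upper triangle is packed column by column
!  into h1, so for n = 3 the order is a(1,1), a(1,2), a(2,2), a(1,3), a(2,3),
!  a(3,3) (n*(n+1)/2 = 6 elements); it is reduced with a single mpi_allreduce
!  and then mirrored back into the lower half of a.
!-------------------------------------------------------------------------------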
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none

   integer  :: n, lda, ldb, comm
   real*8   :: a(lda,ldb)

   integer  :: i, nc, mpierr
   real*8   :: h1(n*n), h2(n*n)

#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("symm_matrix_allreduce")
#endif

   nc = 0
   do i=1,n
     h1(nc+1:nc+i) = a(1:i,i)
     nc = nc+i
   enddo

   call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,comm,mpierr)

   nc = 0
   do i=1,n
     a(1:i,i) = h2(nc+1:nc+i)
     a(i,1:i-1) = a(1:i-1,i)
     nc = nc+i
   enddo

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("symm_matrix_allreduce")
#endif

end subroutine symm_matrix_allreduce

!-------------------------------------------------------------------------------

subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, numBlocks, mpi_comm_rows, &
                                      mpi_comm_cols, useQR)

!-------------------------------------------------------------------------------
!  trans_ev_band_to_full_real:
!  Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix
!
!  Parameters
!
!  na          Order of matrix a, number of rows of matrix q
!
!  nqc         Number of columns of matrix q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi bandwidth
!
!  a(lda,matrixCols)    Matrix containing the Householder vectors (i.e. matrix a after bandred_real)
!              Distribution is like in Scalapack.
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a and q
!
!  tmat(nbw,nbw,numBlocks) Factors returned by bandred_real
!
!  q           On input: Eigenvectors of band matrix
!              On output: Transformed eigenvectors
!              Distribution is like in Scalapack.
!
!  ldq         Leading dimension of q
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
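!
!  Note (illustrative): in the QR path below t_blocking = 2 Householder blocks
!  are aggregated, so with nbw = 32 the back transformation is applied in chunks
!  of cwy_blocking = t_blocking*nbw = 64 columns (tmat_complete is then 64x64).
!-------------------------------------------------------------------------------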
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none

   integer              :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
   real*8               :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)

   integer              :: my_prow, my_pcol, np_rows, np_cols, mpierr
   integer              :: max_blocks_row, max_blocks_col, max_local_rows, &
                           max_local_cols
   integer              :: l_cols, l_rows, l_colh, n_cols
   integer              :: istep, lc, ncol, nrow, nb, ns

   real*8, allocatable  :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)

   integer              :: i

   real*8, allocatable  :: tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:)

   integer              :: cwy_blocking, t_blocking, t_cols, t_rows
   logical, intent(in)  :: useQR

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("trans_ev_band_to_full_real")
#endif

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   max_blocks_row = ((na -1)/nblk)/np_rows + 1  ! Rows of A
   max_blocks_col = ((nqc-1)/nblk)/np_cols + 1  ! Columns of q!

   max_local_rows = max_blocks_row*nblk
   max_local_cols = max_blocks_col*nblk

   if (useQR) then
     t_blocking = 2 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
     cwy_blocking = t_blocking * nbw

     allocate(tmp1(max_local_cols*cwy_blocking))
     allocate(tmp2(max_local_cols*cwy_blocking))
     allocate(hvb(max_local_rows*cwy_blocking))
     allocate(hvm(max_local_rows,cwy_blocking))
     allocate(tmat_complete(cwy_blocking,cwy_blocking))
     allocate(t_tmp(cwy_blocking,nbw))
     allocate(t_tmp2(cwy_blocking,nbw))
   else
     allocate(tmp1(max_local_cols*nbw))
     allocate(tmp2(max_local_cols*nbw))
     allocate(hvb(max_local_rows*nbw))
     allocate(hvm(max_local_rows,nbw))
   endif

   hvm = 0   ! Must be set to 0 !!!
   hvb = 0   ! Safety only

   l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q

   if (useQR) then

     do istep=1,((na-1)/nbw-1)/t_blocking + 1
       n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step

       ! Broadcast all Householder vectors for current step compressed in hvb

       nb = 0
       ns = 0

       do lc = 1, n_cols
         ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder vector
         nrow = ncol - nbw ! absolute number of pivot row