!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"

module ELPA2

! Version 1.1.2, 2011-02-21

  USE ELPA1

#ifdef HAVE_ISO_FORTRAN_ENV
  use iso_fortran_env, only : error_unit
#endif

#ifdef WITH_QR
  use elpa_pdgeqrf
#endif

  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

  public :: bandred_real
  public :: tridiag_band_real
  public :: trans_ev_tridi_to_band_real
  public :: trans_ev_band_to_full_real

  public :: bandred_complex
  public :: tridiag_band_complex
  public :: trans_ev_tridi_to_band_complex
  public :: trans_ev_band_to_full_complex

  public :: get_actual_real_kernel_name, get_actual_complex_kernel_name
  public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, &
            REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ,                &
            REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_AVX_BLOCK2,         &
            REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6

  public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
            COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ,                &
            COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_AVX_BLOCK1,         &
            COMPLEX_ELPA_KERNEL_AVX_BLOCK2

  public :: print_available_real_kernels, print_available_complex_kernels
#ifndef HAVE_ISO_FORTRAN_ENV
  integer, parameter :: error_unit = 6
#endif

  integer, parameter :: number_of_real_kernels           = 8
  integer, parameter :: REAL_ELPA_KERNEL_GENERIC         = 1
  integer, parameter :: REAL_ELPA_KERNEL_GENERIC_SIMPLE  = 2
  integer, parameter :: REAL_ELPA_KERNEL_BGP             = 3
  integer, parameter :: REAL_ELPA_KERNEL_BGQ             = 4
  integer, parameter :: REAL_ELPA_KERNEL_SSE             = 5
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2      = 6
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4      = 7
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6      = 8

#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#else
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#endif
  character(35), parameter, dimension(number_of_real_kernels) :: &
  REAL_ELPA_KERNEL_NAMES =    (/"REAL_ELPA_KERNEL_GENERIC         ", &
                                "REAL_ELPA_KERNEL_GENERIC_SIMPLE  ", &
                                "REAL_ELPA_KERNEL_BGP             ", &
                                "REAL_ELPA_KERNEL_BGQ             ", &
                                "REAL_ELPA_KERNEL_SSE             ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK2      ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK4      ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK6      "/)

  integer, parameter :: number_of_complex_kernels           = 7
  integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC         = 1
  integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE  = 2
  integer, parameter :: COMPLEX_ELPA_KERNEL_BGP             = 3
  integer, parameter :: COMPLEX_ELPA_KERNEL_BGQ             = 4
  integer, parameter :: COMPLEX_ELPA_KERNEL_SSE             = 5
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1      = 6
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2      = 7

#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#else
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#endif
  character(35), parameter, dimension(number_of_complex_kernels) :: &
  COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC         ", &
                                "COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE  ", &
                                "COMPLEX_ELPA_KERNEL_BGP             ", &
                                "COMPLEX_ELPA_KERNEL_BGQ             ", &
                                "COMPLEX_ELPA_KERNEL_SSE             ", &
                                "COMPLEX_ELPA_KERNEL_AVX_BLOCK1      ", &
                                "COMPLEX_ELPA_KERNEL_AVX_BLOCK2      "/)

  integer, parameter                                    ::             &
           AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) =       &
                                      (/                               &
#if WITH_REAL_GENERIC_KERNEL
                                        1                              &
#else
                                        0                              &
#endif
#if WITH_REAL_GENERIC_SIMPLE_KERNEL
                                          ,1                           &
#else
                                          ,0                           &
#endif
#if WITH_REAL_BGP_KERNEL
                                            ,1                         &
#else
                                            ,0                         &
#endif
#if WITH_REAL_BGQ_KERNEL
                                              ,1                       &
#else
                                              ,0                       &
#endif
#if WITH_REAL_SSE_KERNEL
                                                ,1                     &
#else
                                                ,0                     &
#endif
#if WITH_REAL_AVX_BLOCK2_KERNEL
                                                  ,1                   &
#else
                                                  ,0                   &
#endif
#if WITH_REAL_AVX_BLOCK4_KERNEL
                                                    ,1                 &
#else
                                                    ,0                 &
#endif
#if WITH_REAL_AVX_BLOCK6_KERNEL
                                                      ,1               &
#else
                                                      ,0               &
#endif
                                                       /)

  integer, parameter ::                                                   &
           AVAILABLE_COMPLEX_ELPA_KERNELS(number_of_complex_kernels) =    &
                                      (/                                  &
#if WITH_COMPLEX_GENERIC_KERNEL
                                        1                                 &
#else
                                        0                                 &
#endif
#if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
                                          ,1                              &
#else
                                          ,0                              &
#endif
#if WITH_COMPLEX_BGP_KERNEL
                                            ,1                            &
#else
                                            ,0                            &
#endif
#if WITH_COMPLEX_BGQ_KERNEL
                                              ,1                          &
#else
                                              ,0                          &
#endif
#if WITH_COMPLEX_SSE_KERNEL
                                                ,1                        &
#else
                                                ,0                        &
#endif
#if WITH_COMPLEX_AVX_BLOCK1_KERNEL
                                                  ,1                      &
#else
                                                  ,0                      &
#endif
#if WITH_COMPLEX_AVX_BLOCK2_KERNEL
                                                    ,1                    &
#else
                                                    ,0                    &
#endif
                                                   /)

#ifdef WITH_QR
  public :: band_band_real
  public :: divide_band

  integer, public :: which_qr_decomposition = 1     ! defines which QR decomposition algorithm will be used:
                                                    ! 0 for unblocked
                                                    ! 1 for blocked (maxrank: nblk)

#endif
!-------------------------------------------------------------------------------

  ! The following array contains the Householder vectors of the
  ! transformation band -> tridiagonal.
  ! It is allocated and set in tridiag_band_real and used in
  ! trans_ev_tridi_to_band_real.
  ! It must be deallocated by the user after trans_ev_tridi_to_band_real!

  real*8, allocatable :: hh_trans_real(:,:)
  complex*16, allocatable :: hh_trans_complex(:,:)

!-------------------------------------------------------------------------------

  include 'mpif.h'


!******
contains
275
subroutine print_available_real_kernels

  implicit none

  integer :: i

  do i=1, number_of_real_kernels
     if (AVAILABLE_REAL_ELPA_KERNELS(i) .eq. 1) then
        write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
     endif
  enddo
  write(error_unit,*) " "
  write(error_unit,*) " At the moment the following kernel would be chosen:"
  write(error_unit,*) get_actual_real_kernel_name()


end subroutine print_available_real_kernels

subroutine print_available_complex_kernels

  implicit none

  integer :: i

  do i=1, number_of_complex_kernels
     if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .eq. 1) then
        write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
     endif
  enddo
  write(error_unit,*) " "
  write(error_unit,*) " At the moment the following kernel would be chosen:"
  write(error_unit,*) get_actual_complex_kernel_name()


end subroutine print_available_complex_kernels

function get_actual_real_kernel() result(actual_kernel)

  integer :: actual_kernel

  ! if the kernel has not been chosen via the API,
  ! check whether it is set via an environment variable
  actual_kernel = real_kernel_via_environment_variable()

  if (actual_kernel .eq. 0) then
     ! if not then set default kernel
     actual_kernel = DEFAULT_REAL_ELPA_KERNEL
  endif
end function get_actual_real_kernel

function get_actual_real_kernel_name() result(actual_kernel_name)

  character(35) :: actual_kernel_name
  integer       :: actual_kernel
  actual_kernel = get_actual_real_kernel()
  actual_kernel_name = REAL_ELPA_KERNEL_NAMES(actual_kernel)
end function get_actual_real_kernel_name

function get_actual_complex_kernel() result(actual_kernel)

  integer :: actual_kernel

  ! if the kernel has not been chosen via the API,
  ! check whether it is set via an environment variable
  actual_kernel = complex_kernel_via_environment_variable()

  if (actual_kernel .eq. 0) then
     ! if not then set default kernel
     actual_kernel = DEFAULT_COMPLEX_ELPA_KERNEL
  endif
end function get_actual_complex_kernel

function get_actual_complex_kernel_name() result(actual_kernel_name)

  character(35) :: actual_kernel_name
  integer       :: actual_kernel
  actual_kernel = get_actual_complex_kernel()
  actual_kernel_name = COMPLEX_ELPA_KERNEL_NAMES(actual_kernel)
end function get_actual_complex_kernel_name

function check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL) result(err)

  implicit none
  integer, intent(in) :: THIS_REAL_ELPA_KERNEL

  logical             :: err

  err = .false.

  if (AVAILABLE_REAL_ELPA_KERNELS(THIS_REAL_ELPA_KERNEL) .ne. 1) err=.true.

end function check_allowed_real_kernels

function check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL) result(err)

  implicit none
  integer, intent(in) :: THIS_COMPLEX_ELPA_KERNEL

  logical             :: err

  err = .false.

  if (AVAILABLE_COMPLEX_ELPA_KERNELS(THIS_COMPLEX_ELPA_KERNEL) .ne. 1) err=.true.
end function check_allowed_complex_kernels

function real_kernel_via_environment_variable() result(kernel)
  implicit none
  integer :: kernel
  CHARACTER(len=255) :: REAL_KERNEL_ENVIRONMENT
  integer :: i

#if defined(HAVE_ENVIRONMENT_CHECKING)
  call get_environment_variable("REAL_ELPA_KERNEL",REAL_KERNEL_ENVIRONMENT)
#endif
  do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
     if (trim(REAL_KERNEL_ENVIRONMENT) .eq. trim(REAL_ELPA_KERNEL_NAMES(i))) then
        kernel = i
        exit
     else
        kernel = 0
     endif
  enddo


end function real_kernel_via_environment_variable
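
! Usage note (a sketch, not an additional API): when ELPA was built with
! HAVE_ENVIRONMENT_CHECKING, the kernel can be selected from the shell, e.g.
!
!    export REAL_ELPA_KERNEL=REAL_ELPA_KERNEL_GENERIC
!
! Both sides of the comparison above are trimmed, so trailing blanks are
! ignored; an unrecognized name leaves kernel = 0, which get_actual_real_kernel
! maps to DEFAULT_REAL_ELPA_KERNEL.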

function complex_kernel_via_environment_variable() result(kernel)
  implicit none
  integer :: kernel

  CHARACTER(len=255) :: COMPLEX_KERNEL_ENVIRONMENT
  integer :: i
#if defined(HAVE_ENVIRONMENT_CHECKING)
  call get_environment_variable("COMPLEX_ELPA_KERNEL",COMPLEX_KERNEL_ENVIRONMENT)
#endif

  do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
     if (trim(COMPLEX_ELPA_KERNEL_NAMES(i)) .eq. trim(COMPLEX_KERNEL_ENVIRONMENT)) then
        kernel = i
        exit
     else
        kernel = 0
     endif
  enddo

end function complex_kernel_via_environment_variable

function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,   &
                                 mpi_comm_rows, mpi_comm_cols,        &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_real_2stage: Solves the real eigenvalue problem with a two-stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,*)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in ScaLAPACK.
!              The full matrix must be set (not only one half like in ScaLAPACK).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,*)    On output: Eigenvectors of a
!              Distribution is like in ScaLAPACK.
!              Must always be dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
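!
!  A minimal calling sketch (assumed setup: the caller has split
!  mpi_comm_rows/mpi_comm_cols off the global communicator, and a, ev, q
!  are dimensioned as described above):
!
!     logical :: success
!     success = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                     mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
!     if (.not.success) write(error_unit,*) 'solve_evp_real_2stage failed'
!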
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer                       :: THIS_REAL_ELPA_KERNEL

   integer, intent(in)           :: na, nev, lda, ldq, nblk, mpi_comm_rows, &
                                    mpi_comm_cols, mpi_comm_all
   real*8, intent(inout)         :: a(lda,*), ev(na), q(ldq,*)

   integer                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer                       :: nbw, num_blocks
   real*8, allocatable           :: tmat(:,:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
   logical                       :: success
#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   success = .true.

   if (present(THIS_REAL_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
   else
      ! if the kernel has not been chosen via the API,
      ! check whether it is set via an environment variable
      THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
   endif

   ! check whether the chosen kernel is allowed
   if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
      if(my_pe == 0) then
         write(error_unit,*) " "
         write(error_unit,*) "The chosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
         write(error_unit,*) "is not in the list of the allowed kernels!"
         write(error_unit,*) " "
         write(error_unit,*) "Allowed kernels are:"
         do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
            if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
               write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
            endif
         enddo

         write(error_unit,*) " "
         write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used!"
      endif
      THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC

   endif

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk
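   ! (integer division: e.g. nblk=16 and nblk=32 both give nbw=32, nblk=64 gives nbw=64)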

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
                     tmat, success)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, &
                          mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, mpi_comm_rows,  &
                    mpi_comm_cols, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows, &
                                    mpi_comm_cols, success, THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif
1  format(a,f10.3)

end function solve_evp_real_2stage

!-------------------------------------------------------------------------------

!-------------------------------------------------------------------------------

function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
                                    mpi_comm_rows, mpi_comm_cols,      &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_complex_2stage: Solves the complex eigenvalue problem with a two-stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,*)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in ScaLAPACK.
!              The full matrix must be set (not only one half like in ScaLAPACK).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,*)    On output: Eigenvectors of a
!              Distribution is like in ScaLAPACK.
!              Must always be dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
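!
!  A minimal calling sketch (same assumed setup as for solve_evp_real_2stage;
!  a and q are complex*16 here, ev remains real*8):
!
!     logical :: success
!     success = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                        mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
!     if (.not.success) write(error_unit,*) 'solve_evp_complex_2stage failed'
!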
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer                       :: THIS_COMPLEX_ELPA_KERNEL
   integer, intent(in)           :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex*16, intent(inout)     :: a(lda,*), q(ldq,*)
   real*8, intent(inout)         :: ev(na)

   integer                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex*16, allocatable       :: tmat(:,:,:)
   real*8, allocatable           :: q_real(:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i

   logical                       :: success
#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   success = .true.

   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
   else
      ! if the kernel has not been chosen via the API,
      ! check whether it is set via an environment variable
      THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
   endif

   ! check whether the chosen kernel is allowed
   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then
      if(my_pe == 0) then
         write(error_unit,*) " "
         write(error_unit,*) "The chosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
         write(error_unit,*) "is not in the list of the allowed kernels!"
         write(error_unit,*) " "
         write(error_unit,*) "Allowed kernels are:"
         do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
            if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
               write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
            endif
         enddo

         write(error_unit,*) " "
         write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used!"
      endif
      THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
!      call MPI_ABORT(mpi_comm_all, mpierr)
   endif
   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_complex(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
                        tmat, success)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop()
#endif
     return
   endif
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

   allocate(q_real(l_rows,l_cols))

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,1), nblk, &
                    mpi_comm_rows, mpi_comm_cols, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
      write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,  &
                                       mpi_comm_rows, mpi_comm_cols,&
                                       success,THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif

1  format(a,f10.3)

end function solve_evp_complex_2stage

!-------------------------------------------------------------------------------

subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
                        tmat, success)

!-------------------------------------------------------------------------------
!  bandred_real: Reduces a distributed symmetric matrix to band form
!
!  Parameters
!
!  na          Order of matrix
!
!  a(lda,*)    Distributed matrix which should be reduced.
!              Distribution is like in ScaLAPACK.
!              As opposed to ScaLAPACK, a(:,:) must be set completely (upper and lower half);
!              a(:,:) is overwritten on exit with the band and the Householder vectors
!              in the upper half.
!
!  lda         Leading dimension of a
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi bandwidth of output matrix
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!  tmat(nbw,nbw,num_blocks)    where num_blocks = (na-1)/nbw + 1
!              Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
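!
!  Caller-side allocation sketch, mirroring solve_evp_real_2stage above
!  (nbw must be a multiple of nblk):
!
!     nbw = (31/nblk+1)*nblk
!     num_blocks = (na-1)/nbw + 1
!     allocate(tmat(nbw,nbw,num_blocks))
!     call bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
!                       tmat, success)
!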
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none

   integer             :: na, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols
   real*8              :: a(lda,*), tmat(nbw,nbw,*)

   integer             :: my_prow, my_pcol, np_rows, np_cols, mpierr
   integer             :: l_cols, l_rows
   integer             :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
   integer             :: istep, ncol, lch, lcx, nlc
   integer             :: tile_size, l_rows_tile, l_cols_tile

   real*8              :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)

   real*8, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)

   integer             :: pcol, prow

#ifdef WITH_QR
   ! needed for blocked QR decomposition
   integer             :: PQRPARAM(11), work_size
   real*8              :: dwork_size(1)
   real*8, allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)
#endif

   logical, intent(out):: success

   ! statement functions must follow all declarations
   pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number
   prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("bandred_real")
#endif
   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
   success = .true.

   ! Semibandwith nbw must be a multiple of blocksize nblk

   if(mod(nbw,nblk)/=0) then
      if(my_prow==0 .and. my_pcol==0) then
         write(error_unit,*) 'ERROR: nbw=',nbw,', nblk=',nblk
         write(error_unit,*) 'ELPA2 works only for nbw==n*nblk'
         success = .false.
!         call mpi_abort(mpi_comm_world,0,mpierr)
      endif
   endif

   ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

   tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
   tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

   l_rows_tile = tile_size/np_rows ! local rows of a tile
   l_cols_tile = tile_size/np_cols ! local cols of a tile

#ifdef WITH_QR

   if (which_qr_decomposition == 1) then
      call qr_pqrparam_init(pqrparam,    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
      allocate(tauvector(na))
      allocate(blockheuristic(nblk))
      l_rows = local_index(na, my_prow, np_rows, nblk, -1)
      allocate(vmr(max(l_rows,1),na))

      call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), tmat(1,1,1), nbw, dwork_size(1), -1, na, &
                             nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
      work_size = dwork_size(1)
      allocate(work_blocked(work_size))

      work_blocked = 0.0d0
      deallocate(vmr)
   endif
#endif


   do istep = (na-1)/nbw, 1, -1

      n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

      ! Number of local columns/rows of remaining matrix
      l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
      l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

      ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

      allocate(vmr(max(l_rows,1),2*n_cols))
      allocate(umc(max(l_cols,1),2*n_cols))

      allocate(vr(l_rows+1))

      vmr(1:l_rows,1:n_cols) = 0.
      vr(:) = 0
      tmat(:,:,istep) = 0

      ! Reduce current block to lower triangular form
#ifdef WITH_QR
      if (which_qr_decomposition == 1) then
         call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), &
                                 tmat(1,1,istep), nbw, work_blocked,       &
                                 work_size, na, n_cols, nblk, nblk,        &
                                 istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
                                 blockheuristic)
      else
#endif
        do lc = n_cols, 1, -1

           ncol = istep*nbw + lc ! absolute column number of householder vector
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if(nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol) ! Processor column owning current block

           if(my_pcol==cur_pcol) then

              ! Get vector to be transformed; distribute last element and norm of
              ! remaining elements to all procs in current column

              vr(1:lr) = a(1:lr,lch) ! vector to be transformed

              if(my_prow==prow(nrow)) then
                 aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
                 aux1(2) = vr(lr)
              else
                 aux1(1) = dot_product(vr(1:lr),vr(1:lr))
                 aux1(2) = 0.
              endif

              call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

              vnorm2 = aux2(1)
              vrl    = aux2(2)

              ! Householder transformation

              call hh_transform_real(vrl, vnorm2, xf, tau)

              ! Scale vr and store Householder vector for back transformation

              vr(1:lr) = vr(1:lr) * xf
              if(my_prow==prow(nrow)) then
                 a(1:lr-1,lch) = vr(1:lr-1)
                 a(lr,lch) = vrl
                 vr(lr) = 1.
              else
                 a(1:lr,lch) = vr(1:lr)
              endif

           endif

           ! Broadcast Householder vector and tau along columns

           vr(lr+1) = tau
           call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
           vmr(1:lr,lc) = vr(1:lr)
           tau = vr(lr+1)
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat

           ! Transform remaining columns in current block with Householder vector
           ! Local dot product

           aux1 = 0

           nlc = 0 ! number of local columns
           do j=1,lc-1
              lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
              if(lcx>0) then
                 nlc = nlc+1
                 if(lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
              endif
           enddo

           ! Get global dot products
           if(nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
              lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
              if(lcx>0) then
                 nlc = nlc+1
                 a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
              endif
           enddo

        enddo

        ! Calculate scalar products of stored Householder vectors.
        ! This can be done in different ways, we use dsyrk

        vav = 0
        if(l_rows>0) &
           call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,1),0.d0,vav,ubound(vav,1))
        call symm_matrix_allreduce(n_cols,vav,ubound(vav,1),mpi_comm_rows)

        ! Calculate triangular matrix T for block Householder Transformation

        do lc=n_cols,1,-1
           tau = tmat(lc,lc,istep)
           if(lc<n_cols) then
              call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,1),vav(lc+1,lc),1)
              tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
           endif
        enddo

#ifdef WITH_QR
     endif
#endif

      ! Transpose vmr -> vmc (stored in umc, second half)

      call elpa_transpose_vectors  (vmr, ubound(vmr,1), mpi_comm_rows, &
                                    umc(1,n_cols+1), ubound(umc,1), mpi_comm_cols, &
                                    1, istep*nbw, n_cols, nblk)

      ! Calculate umc = A**T * vmr
      ! Note that the distributed A has to be transposed
      ! As opposed to direct tridiagonalization there is no need to use the cache locality
      ! of the tiles, so we can use strips of the matrix

      umc(1:l_cols,1:n_cols) = 0.d0
      vmr(1:l_rows,n_cols+1:2*n_cols) = 0
      if(l_cols>0 .and. l_rows>0) then
         do i=0,(istep*nbw-1)/tile_size

            lcs = i*l_cols_tile+1
            lce = min(l_cols,(i+1)*l_cols_tile)
            if(lce<lcs) cycle

            lre = min(l_rows,(i+1)*l_rows_tile)
            call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,1), &
                       vmr,ubound(vmr,1),1.d0,umc(lcs,1),ubound(umc,1))

            if(i==0) cycle
            lre = min(l_rows,i*l_rows_tile)
            call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
                       umc(lcs,n_cols+1),ubound(umc,1),1.d0,vmr(1,n_cols+1),ubound(vmr,1))
         enddo
      endif

      ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
      ! on the processors containing the diagonal
      ! This is only necessary if ur has been calculated, i.e. if the
      ! global tile size is smaller than the global remaining matrix

      if(tile_size < istep*nbw) then
         call elpa_reduce_add_vectors  (vmr(1,n_cols+1),ubound(vmr,1),mpi_comm_rows, &
                                        umc, ubound(umc,1), mpi_comm_cols, &
                                        istep*nbw, n_cols, nblk)
      endif

      if(l_cols>0) then
         allocate(tmp(l_cols,n_cols))
         call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
         umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
         deallocate(tmp)
      endif

      ! U = U * Tmat**T

      call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,1),umc,ubound(umc,1))

      ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T

      call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umc,ubound(umc,1),umc(1,n_cols+1),ubound(umc,1),0.d0,vav,ubound(vav,1))
      call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,1),vav,ubound(vav,1))

      call symm_matrix_allreduce(n_cols,vav,ubound(vav,1),mpi_comm_cols)

      ! U = U - 0.5 * V * VAV
      call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umc(1,n_cols+1),ubound(umc,1),vav,ubound(vav,1),1.d0,umc,ubound(umc,1))

      ! Transpose umc -> umr (stored in vmr, second half)

       call elpa_transpose_vectors  (umc, ubound(umc,1), mpi_comm_cols, &
                                     vmr(1,n_cols+1), ubound(vmr,1), mpi_comm_rows, &
                                     1, istep*nbw, n_cols, nblk)

      ! A = A - V*U**T - U*V**T

      do i=0,(istep*nbw-1)/tile_size
         lcs = i*l_cols_tile+1
         lce = min(l_cols,(i+1)*l_cols_tile)
         lre = min(l_rows,(i+1)*l_rows_tile)
         if(lce<lcs .or. lre<1) cycle
         call dgemm('N','T',lre,lce-lcs+1,2*n_cols,-1.d0, &
                    vmr,ubound(vmr,1),umc(lcs,1),ubound(umc,1), &
                    1.d0,a(1,lcs),lda)
      enddo

      deallocate(vmr, umc, vr)

   enddo
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("bandred_real")
#endif

#ifdef WITH_QR
   if (which_qr_decomposition == 1) then
      deallocate(work_blocked)
      deallocate(tauvector)
   endif
#endif

end subroutine bandred_real

!-------------------------------------------------------------------------------

subroutine symm_matrix_allreduce(n,a,lda,comm)

!-------------------------------------------------------------------------------
!  symm_matrix_allreduce: Does an mpi_allreduce for a symmetric matrix A.
!  On entry, only the upper half of A needs to be set
!  On exit, the complete matrix is set
!-------------------------------------------------------------------------------
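!
!  Layout sketch of the packed buffer used below: the upper triangle is stored
!  column by column, e.g. for n = 3
!
!     h1 = (/ a(1,1), a(1,2), a(2,2), a(1,3), a(2,3), a(3,3) /)
!
!  so only n*(n+1)/2 elements go through mpi_allreduce instead of n*n.
!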

   implicit none
   integer n, lda, comm
   real*8 a(lda,*)

   integer i, nc, mpierr
   real*8 h1(n*n), h2(n*n)

   nc = 0
   do i=1,n
      h1(nc+1:nc+i) = a(1:i,i)
      nc = nc+i
   enddo

   call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,comm,mpierr)

   nc = 0
   do i=1,n
      a(1:i,i) = h2(nc+1:nc+i)
      a(i,1:i-1) = a(1:i-1,i)
      nc = nc+i
   enddo

end subroutine symm_matrix_allreduce

!-------------------------------------------------------------------------------

subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, mpi_comm_cols)


!-------------------------------------------------------------------------------
!  trans_ev_band_to_full_real:
!  Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix
!
!  Parameters
!
!  na          Order of matrix a, number of rows of matrix q
!
!  nqc         Number of columns of matrix q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi bandwidth
!
!  a(lda,*)    Matrix containing the Householder vectors (i.e. matrix a after bandred_real)
!              Distribution is like in ScaLAPACK.
!
!  lda         Leading dimension of a
!
!  tmat(nbw,nbw,.) Factors returned by bandred_real
!
!  q           On input: Eigenvectors of band matrix
!              On output: Transformed eigenvectors
!              Distribution is like in ScaLAPACK.
!
!  ldq         Leading dimension of q
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
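!
!  Calling sketch (as used by solve_evp_real_2stage above), with a and tmat
!  exactly as returned by bandred_real and q as returned by
!  trans_ev_tridi_to_band_real:
!
!     call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, &
!                                     mpi_comm_rows, mpi_comm_cols)
!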
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none

   integer na, nqc, lda, ldq, nblk, nbw, mpi_comm_rows, mpi_comm_cols
   real*8 a(lda,*), q(ldq,*), tmat(nbw, nbw, *)

   integer my_prow, my_pcol, np_rows, np_cols, mpierr
   integer max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
   integer l_cols, l_rows, l_colh, n_cols
   integer istep, lc, ncol, nrow, nb, ns

   real*8, allocatable:: tmp1(:), tmp2(:), hvb(:), hvm(:,:)

   integer pcol, prow, i

#ifdef WITH_QR
   real*8, allocatable  :: tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:)
   integer :: cwy_blocking, t_blocking, t_cols, t_rows
#endif
   pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number
   prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("trans_ev_band_to_full_real")
#endif

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   max_blocks_row = ((na -1)/nblk)/np_rows + 1  ! Rows of A
   max_blocks_col = ((nqc-1)/nblk)/np_cols + 1  ! Columns of q!

   max_local_rows = max_blocks_row*nblk
   max_local_cols = max_blocks_col*nblk
#ifdef WITH_QR
   t_blocking = 2 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
   cwy_blocking = t_blocking * nbw

   allocate(tmp1(max_local_cols*cwy_blocking))
   allocate(tmp2(max_local_cols*cwy_blocking))
   allocate(hvb(max_local_rows*cwy_blocking))
   allocate(hvm(max_local_rows,cwy_blocking))
   allocate(tmat_complete(cwy_blocking,cwy_blocking))
   allocate(t_tmp(cwy_blocking,nbw))
   allocate(t_tmp2(cwy_blocking,nbw))

#else

   allocate(tmp1(max_local_cols*nbw))
   allocate(tmp2(max_local_cols*nbw))
   allocate(hvb(max_local_rows*nbw))
   allocate(hvm(max_local_rows,nbw))
#endif

   hvm = 0   ! Must be set to 0 !!!
   hvb = 0   ! Safety only

   l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q

#ifdef WITH_QR
   do istep=1,((na-1)/nbw-1)/t_blocking + 1
      n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step
#else
   do istep=1,(na-1)/nbw

      n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
#endif

      ! Broadcast all Householder vectors for current step compressed in hvb

      nb = 0
      ns = 0

      do lc = 1, n_cols
#ifdef WITH_QR
         ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder vector
#else
         ncol = istep*nbw + lc ! absolute column number of householder vector
#endif
         nrow = ncol - nbw ! absolute number of pivot row

         l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
         l_colh = local_index(ncol  , my_pcol, np_cols, nblk, -1) ! HV local column number

         if(my_pcol==pcol(ncol)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh)

         nb = nb+l_rows

         if(lc==n_cols .or. mod(ncol,nblk)==0) then
            call MPI_Bcast(hvb(ns+1),nb-ns,MPI_REAL8,pcol(ncol),mpi_comm_cols,mpierr)
            ns = nb
         endif
      enddo

      ! Expand compressed Householder vectors into matrix hvm

      nb = 0
      do lc = 1, n_cols
#ifdef WITH_QR
         nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row
#else
         nrow = (istep-1)*nbw+lc ! absolute number of pivot row
#endif
         l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast

         hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
         if(my_prow==prow(nrow)) hvm(l_rows+1,lc) = 1.

         nb = nb+l_rows
      enddo

#ifdef WITH_QR
      l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1)

      ! compute tmat2 out of tmat(:,:,)
      tmat_complete = 0
      do i = 1, t_blocking
         t_cols = MIN(nbw, n_cols - (i-1)*nbw)
         if(t_cols <= 0) exit
         t_rows = (i - 1) * nbw
         tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i)
         if(i > 1) then
            call dgemm('T', 'N', t_rows, t_cols, l_rows, 1.d0, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), &
                       max_local_rows, 0.d0, t_tmp, cwy_blocking)
            call mpi_allreduce(t_tmp,t_tmp2,cwy_blocking*nbw,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
            call dtrmm('L','U','N','N',t_rows,t_cols,1.0d0,tmat_complete,cwy_blocking,t_tmp2,cwy_blocking)
            call dtrmm('R','U','N','N',t_rows,t_cols,-1.0d0,tmat_complete(t_rows+1,t_rows+1),cwy_blocking,t_tmp2,cwy_blocking)
            tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
         endif
      enddo
#else
      l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1)
#endif

      ! Q = Q - V * T**T * V**T * Q

      if(l_rows>0) then
         call dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm,ubound(hvm,1), &
                    q,ldq,0.d0,tmp1,n_cols)
      else
         tmp1(1:l_cols*n_cols) = 0
      endif
      call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
      if(l_rows>0) then
#ifdef WITH_QR
         call dtrmm('L','U','T','N',n_cols,l_cols,1.0d0,tmat_complete,cwy_blocking,tmp2,n_cols)
         call dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm,ubound(hvm,1), tmp2,n_cols,1.d0,q,ldq)

#else
         call dtrmm('L','U','T','N',n_cols,l_cols,1.0d0,tmat(1,1,istep),ubound(tmat,1),tmp2,n_cols)
         call dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm,ubound(hvm,1), &
                    tmp2,n_cols,1.d0,q,ldq)
#endif
      endif

   enddo

   deallocate(tmp1, tmp2, hvb, hvm)
#ifdef WITH_QR
   deallocate(tmat_complete, t_tmp, t_tmp2)
#endif


#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("trans_ev_band_to_full_real")
#endif
end subroutine trans_ev_band_to_full_real

! --------------------------------------------------------------------------------------------------

subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_comm_cols, mpi_comm)

!-------------------------------------------------------------------------------
! tridiag_band_real:
! Reduces a real symmetric band matrix to tridiagonal form
!
!  na          Order of matrix a
!
!  nb          Semi bandwidth
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  a(lda,*)    Distributed system matrix reduced to banded form in the upper diagonal
!
!  lda         Leading dimension of a
!
!  d(na)       Diagonal of tridiagonal matrix, set only on PE 0 (output)
!
!  e(na)       Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm
!              MPI-Communicator for the total processor set
!-------------------------------------------------------------------------------
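!
!  Note: only PE 0 receives d(:) and e(:); solve_evp_real_2stage therefore
!  broadcasts both before calling solve_tridi (sketch):
!
!     call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, &
!                            mpi_comm_cols, mpi_comm_all)
!     call mpi_bcast(ev, na, MPI_REAL8, 0, mpi_comm_all, mpierr)
!     call mpi_bcast(e,  na, MPI_REAL8, 0, mpi_comm_all, mpierr)
!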
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none

   integer, intent(in) ::  na, nb, nblk, lda, mpi_comm_rows, mpi_comm_cols, mpi_comm
   real*8, intent(in)  :: a(lda,*)
   real*8, intent(out) :: d(na), e(na) ! set only on PE 0


   real*8 vnorm2, hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
   real*8 hd(nb), hs(nb)

   integer i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
   integer my_pe, n_pes, mpierr
   integer my_prow, np_rows, my_pcol, np_cols
   integer ireq_ab, ireq_hv
   integer na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
#ifdef WITH_OPENMP
   integer max_threads, my_thread, my_block_s, my_block_e, iter
   integer mpi_status(MPI_STATUS_SIZE)
   integer, allocatable :: mpi_statuses(:,:), global_id_tmp(:,:)
   integer, allocatable :: omp_block_limits(:)
   real*8, allocatable :: hv_t(:,:), tau_t(:)
#endif
   integer, allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
   integer, allocatable :: limits(:), snd_limits(:,:)
   integer, allocatable :: block_limits(:)
   real*8, allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
   ! dummies for calling redist_band
   complex*16 :: c_a(1,1), c_ab(1,1)

#ifdef WITH_OPENMP
   integer :: omp_get_max_threads
#endif
#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("tridiag_band_real")
#endif