elpa2.F90 213 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"

module ELPA2

! Version 1.1.2, 2011-02-21

  USE ELPA1

67
68
69
#ifdef HAVE_ISO_FORTRAN_ENV
  use iso_fortran_env, only : error_unit
#endif
70
71
72

  use elpa_pdgeqrf

73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

  public :: bandred_real
  public :: tridiag_band_real
  public :: trans_ev_tridi_to_band_real
  public :: trans_ev_band_to_full_real

  public :: bandred_complex
  public :: tridiag_band_complex
  public :: trans_ev_tridi_to_band_complex
  public :: trans_ev_band_to_full_complex

Andreas Marek's avatar
Andreas Marek committed
92
93
94
95
96
97
98
99
100
101
102
103
  public :: get_actual_real_kernel_name, get_actual_complex_kernel_name
  public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, &
            REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ,                &
            REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_AVX_BLOCK2,         &
            REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6

  public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
            COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ,                &
            COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_AVX_BLOCK1,         &
            COMPLEX_ELPA_KERNEL_AVX_BLOCK2

  public :: print_available_real_kernels, print_available_complex_kernels
104
105
106
107
#ifndef HAVE_ISO_FORTRAN_ENV
  integer, parameter :: error_unit = 6
#endif

Andreas Marek's avatar
Andreas Marek committed
108
109
110
111
112
113
114
115
116
117
118
119

  integer, parameter :: number_of_real_kernels           = 8
  integer, parameter :: REAL_ELPA_KERNEL_GENERIC         = 1
  integer, parameter :: REAL_ELPA_KERNEL_GENERIC_SIMPLE  = 2
  integer, parameter :: REAL_ELPA_KERNEL_BGP             = 3
  integer, parameter :: REAL_ELPA_KERNEL_BGQ             = 4
  integer, parameter :: REAL_ELPA_KERNEL_SSE             = 5
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2      = 6
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4      = 7
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6      = 8

#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
120
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
121
#else
122
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
123
124
#endif
  character(35), parameter, dimension(number_of_real_kernels) :: &
125
126
127
128
129
130
131
132
  REAL_ELPA_KERNEL_NAMES =    (/"REAL_ELPA_KERNEL_GENERIC         ", &
                                "REAL_ELPA_KERNEL_GENERIC_SIMPLE  ", &
                                "REAL_ELPA_KERNEL_BGP             ", &
                                "REAL_ELPA_KERNEL_BGQ             ", &
                                "REAL_ELPA_KERNEL_SSE             ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK2      ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK4      ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK6      "/)
Andreas Marek's avatar
Andreas Marek committed
133
134
135
136
137
138
139
140
141
142
143

  integer, parameter :: number_of_complex_kernels           = 7
  integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC         = 1
  integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE  = 2
  integer, parameter :: COMPLEX_ELPA_KERNEL_BGP             = 3
  integer, parameter :: COMPLEX_ELPA_KERNEL_BGQ             = 4
  integer, parameter :: COMPLEX_ELPA_KERNEL_SSE             = 5
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1      = 6
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2      = 7

#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
144
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
145
#else
146
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
147
148
#endif
  character(35), parameter, dimension(number_of_complex_kernels) :: &
149
150
151
152
153
154
155
  COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC         ", &
                                "COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE  ", &
                                "COMPLEX_ELPA_KERNEL_BGP             ", &
                                "COMPLEX_ELPA_KERNEL_BGQ             ", &
                                "COMPLEX_ELPA_KERNEL_SSE             ", &
                                "COMPLEX_ELPA_KERNEL_AVX_BLOCK1      ", &
                                "COMPLEX_ELPA_KERNEL_AVX_BLOCK2      "/)
Andreas Marek's avatar
Andreas Marek committed
156
157
158
159
160
161
162

  integer, parameter                                    ::             &
           AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) =       &
                                      (/                               &
#if WITH_REAL_GENERIC_KERNEL
                                        1                              &
#else
163
                                        0                              &
Andreas Marek's avatar
Andreas Marek committed
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#endif
#if WITH_REAL_GENERIC_SIMPLE_KERNEL
                                          ,1                           &
#else
                                          ,0                           &
#endif
#if WITH_REAL_BGP_KERNEL
                                            ,1                         &
#else
                                            ,0                         &
#endif
#if WITH_REAL_BGQ_KERNEL
                                              ,1                       &
#else
                                              ,0                       &
#endif
#if WITH_REAL_SSE_KERNEL
                                                ,1                     &
#else
                                                ,0                     &
#endif
#if WITH_REAL_AVX_BLOCK2_KERNEL
                                                  ,1                   &
#else
                                                  ,0                   &
#endif
#if WITH_REAL_AVX_BLOCK4_KERNEL
                                                    ,1                 &
#else
                                                    ,0                 &
#endif
#if WITH_REAL_AVX_BLOCK6_KERNEL
                                                      ,1               &
#else
                                                      ,0               &
#endif
                                                       /)

  integer, parameter ::                                                   &
           AVAILABLE_COMPLEX_ELPA_KERNELS(number_of_complex_kernels) =    &
                                      (/                                  &
#if WITH_COMPLEX_GENERIC_KERNEL
                                        1                                 &
#else
                                        0                                 &
#endif
#if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
                                          ,1                              &
#else
                                          ,0                              &
#endif
#if WITH_COMPLEX_BGP_KERNEL
                                            ,1                            &
#else
                                            ,0                            &
#endif
#if WITH_COMPLEX_BGQ_KERNEL
                                              ,1                          &
#else
                                              ,0                          &
#endif
#if WITH_COMPLEX_SSE_KERNEL
                                                ,1                        &
#else
                                                ,0                        &
#endif
230
#if WITH_COMPLEX_AVX_BLOCK1_KERNEL
Andreas Marek's avatar
Andreas Marek committed
231
232
233
234
                                                  ,1                      &
#else
                                                  ,0                      &
#endif
235
#if WITH_COMPLEX_AVX_BLOCK2_KERNEL
Andreas Marek's avatar
Andreas Marek committed
236
237
238
239
240
                                                    ,1                    &
#else
                                                    ,0                    &
#endif
                                                   /)
241
242
243
244
245
246
247

  public :: band_band_real
  public :: divide_band

  integer, public :: which_qr_decomposition = 1     ! defines, which QR-decomposition algorithm will be used
                                                    ! 0 for unblocked
                                                    ! 1 for blocked (maxrank: nblk)
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
!-------------------------------------------------------------------------------

  ! The following array contains the Householder vectors of the
  ! transformation band -> tridiagonal.
  ! It is allocated and set in tridiag_band_real and used in
  ! trans_ev_tridi_to_band_real.
  ! It must be deallocated by the user after trans_ev_tridi_to_band_real!

  real*8, allocatable :: hh_trans_real(:,:)
  complex*16, allocatable :: hh_trans_complex(:,:)

!-------------------------------------------------------------------------------

  include 'mpif.h'


!******
contains
266

Andreas Marek's avatar
Andreas Marek committed
267
268
269
270
271
subroutine print_available_real_kernels

  implicit none

  integer :: i
272

Andreas Marek's avatar
Andreas Marek committed
273
  do i=1, number_of_real_kernels
274
275
276
    if (AVAILABLE_REAL_ELPA_KERNELS(i) .eq. 1) then
      write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
    endif
Andreas Marek's avatar
Andreas Marek committed
277
278
279
280
281
282
283
284
285
286
287
288
289
  enddo
  write(error_unit,*) " "
  write(error_unit,*) " At the moment the following kernel would be choosen:"
  write(error_unit,*) get_actual_real_kernel_name()


end subroutine print_available_real_kernels

subroutine print_available_complex_kernels

  implicit none

  integer :: i
290

291
  do i=1, number_of_complex_kernels
292
293
294
    if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .eq. 1) then
       write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
    endif
Andreas Marek's avatar
Andreas Marek committed
295
296
297
  enddo
  write(error_unit,*) " "
  write(error_unit,*) " At the moment the following kernel would be choosen:"
Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
298
  write(error_unit,*) get_actual_complex_kernel_name()
Andreas Marek's avatar
Andreas Marek committed
299
300
301
302


end subroutine print_available_complex_kernels

303
function get_actual_real_kernel() result(actual_kernel)
Andreas Marek's avatar
Andreas Marek committed
304
305

  integer :: actual_kernel
306

Andreas Marek's avatar
Andreas Marek committed
307
308
309
  ! if kernel is not choosen via api
  ! check whether set by environment variable
  actual_kernel = real_kernel_via_environment_variable()
310

Andreas Marek's avatar
Andreas Marek committed
311
  if (actual_kernel .eq. 0) then
312
313
    ! if not then set default kernel
    actual_kernel = DEFAULT_REAL_ELPA_KERNEL
Andreas Marek's avatar
Andreas Marek committed
314
315
316
  endif
end function get_actual_real_kernel

317
function get_actual_real_kernel_name() result(actual_kernel_name)
Andreas Marek's avatar
Andreas Marek committed
318
319
320
321
322
323
324

  character(35) :: actual_kernel_name
  integer       :: actual_kernel
  actual_kernel = get_actual_real_kernel()
  actual_kernel_name = REAL_ELPA_KERNEL_NAMES(actual_kernel)
end function get_actual_real_kernel_name

325
function get_actual_complex_kernel() result(actual_kernel)
Andreas Marek's avatar
Andreas Marek committed
326
327

  integer :: actual_kernel
328

Andreas Marek's avatar
Andreas Marek committed
329
330
331
  ! if kernel is not choosen via api
  ! check whether set by environment variable
  actual_kernel = complex_kernel_via_environment_variable()
332

Andreas Marek's avatar
Andreas Marek committed
333
  if (actual_kernel .eq. 0) then
334
335
    ! if not then set default kernel
    actual_kernel = DEFAULT_COMPLEX_ELPA_KERNEL
Andreas Marek's avatar
Andreas Marek committed
336
337
338
  endif
end function get_actual_complex_kernel

339
function get_actual_complex_kernel_name() result(actual_kernel_name)
Andreas Marek's avatar
Andreas Marek committed
340
341
342
343
344
345
346
347
348
349
350

  character(35) :: actual_kernel_name
  integer       :: actual_kernel
  actual_kernel = get_actual_complex_kernel()
  actual_kernel_name = COMPLEX_ELPA_KERNEL_NAMES(actual_kernel)
end function get_actual_complex_kernel_name

function check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL) result(err)

  implicit none
  integer, intent(in) :: THIS_REAL_ELPA_KERNEL
351

Andreas Marek's avatar
Andreas Marek committed
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
  logical             :: err

  err = .false.

  if (AVAILABLE_REAL_ELPA_KERNELS(THIS_REAL_ELPA_KERNEL) .ne. 1) err=.true.

end function check_allowed_real_kernels

function check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL) result(err)

  implicit none
  integer, intent(in) :: THIS_COMPLEX_ELPA_KERNEL

  logical             :: err

  err = .false.

  if (AVAILABLE_COMPLEX_ELPA_KERNELS(THIS_COMPLEX_ELPA_KERNEL) .ne. 1) err=.true.
end function check_allowed_complex_kernels

372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
function qr_decomposition_via_environment_variable(useQR) result(isSet)
  implicit none
  logical, intent(out) :: useQR
  logical              :: isSet
  CHARACTER(len=255)   :: ELPA_QR_DECOMPOSITION

  isSet = .false.

#if defined(HAVE_ENVIRONMENT_CHECKING)
  call get_environment_variable("ELPA_QR_DECOMPOSITION",ELPA_QR_DECOMPOSITION)
#endif
  if (trim(ELPA_QR_DECOMPOSITION) .eq. "yes") then
    useQR = .true.
    isSet = .true.
  endif
  if (trim(ELPA_QR_DECOMPOSITION) .eq. "no") then
    useQR = .false.
    isSet = .true.
  endif

end function qr_decomposition_via_environment_variable


395
function real_kernel_via_environment_variable() result(kernel)
Andreas Marek's avatar
Andreas Marek committed
396
397
398
399
400
401
402
403
404
405
  implicit none
  integer :: kernel
  CHARACTER(len=255) :: REAL_KERNEL_ENVIRONMENT
  integer :: i

#if defined(HAVE_ENVIRONMENT_CHECKING)
  call get_environment_variable("REAL_ELPA_KERNEL",REAL_KERNEL_ENVIRONMENT)
#endif
  do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
!     if (trim(dummy_char) .eq. trim(REAL_ELPA_KERNEL_NAMES(i))) then
406
407
408
409
410
411
    if (trim(REAL_KERNEL_ENVIRONMENT) .eq. trim(REAL_ELPA_KERNEL_NAMES(i))) then
      kernel = i
      exit
    else
      kernel = 0
    endif
Andreas Marek's avatar
Andreas Marek committed
412
413
414
  enddo
end function real_kernel_via_environment_variable

415
function complex_kernel_via_environment_variable() result(kernel)
Andreas Marek's avatar
Andreas Marek committed
416
417
418
419
420
421
422
423
424
425
  implicit none
  integer :: kernel

  CHARACTER(len=255) :: COMPLEX_KERNEL_ENVIRONMENT
  integer :: i
#if defined(HAVE_ENVIRONMENT_CHECKING)
  call get_environment_variable("COMPLEX_ELPA_KERNEL",COMPLEX_KERNEL_ENVIRONMENT)
#endif

  do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
426
427
428
429
430
431
    if (trim(COMPLEX_ELPA_KERNEL_NAMES(i)) .eq. trim(COMPLEX_KERNEL_ENVIRONMENT)) then
      kernel = i
      exit
    else
      kernel = 0
    endif
Andreas Marek's avatar
Andreas Marek committed
432
433
434
435
  enddo

end function complex_kernel_via_environment_variable

436
437
438
439
function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474

!-------------------------------------------------------------------------------
!  solve_evp_real_2stage: Solves the real eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,*)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,*)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
475
476
477
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
478
   implicit none
479
480
   logical, intent(in), optional :: useQR
   logical                       :: useQRActual, useQREnvironment
Andreas Marek's avatar
Andreas Marek committed
481
   integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
482
   integer                       :: THIS_REAL_ELPA_KERNEL
483

484
485
486
   integer, intent(in)           :: na, nev, lda, ldq, nblk, mpi_comm_rows, &
                                    mpi_comm_cols, mpi_comm_all
   real*8, intent(inout)         :: a(lda,*), ev(na), q(ldq,*)
487

488
489
490
491
492
493
   integer                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer                       :: nbw, num_blocks
   real*8, allocatable           :: tmat(:,:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
   logical                       :: success
494

495
496
497
#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif
498
499
500
501
502
503
504
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
505

506
507
   success = .true.

508
509
510
511
512
513
514
515
516
517
518
519
520
521
   useQRActual = .false.

   ! set usage of qr decomposition via API call
   if (present(useQR)) then
     if (useQR) useQRActual = .true.
     if (useQR .and. na .lt. 800) useQRActual = .false.
     if (.not.(useQR)) useQRACtual = .false.
   endif

   ! overwrite this with environment variable settings
   if (qr_decomposition_via_environment_variable(useQREnvironment)) then
     useQRActual = useQREnvironment
   endif

522
523
524
   if (present(THIS_REAL_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
Andreas Marek's avatar
Andreas Marek committed
525
   else
526

527
528
529
     ! if kernel is not choosen via api
     ! check whether set by environment variable
     THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
Andreas Marek's avatar
Andreas Marek committed
530
531
532
533
   endif

   ! check whether choosen kernel is allowed
   if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
534

535
536
537
538
539
540
541
542
543
544
545
     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
         endif
       enddo
Andreas Marek's avatar
Andreas Marek committed
546

547
548
549
550
       write(error_unit,*) " "
       write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !"
     endif
     THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
551
552

   endif
553
554
555
556
557
558
559
560
561
562
563
564
565

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
566
   call bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
567
                     tmat, success, useQRActual)
568
   if (.not.(success)) return
569
   ttt1 = MPI_Wtime()
570
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
571
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0
572
573
574
575
576
577

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
Andreas Marek's avatar
Andreas Marek committed
578
579
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, &
                          mpi_comm_cols, mpi_comm_all)
580
   ttt1 = MPI_Wtime()
581
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
582
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0
583
584
585
586
587
588
589
590
591
592

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
593
594
595
596
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, mpi_comm_rows,  &
                    mpi_comm_cols, success)
   if (.not.(success)) return

597
   ttt1 = MPI_Wtime()
598
599
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
     write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
600
601
602
603
604
605
606
607
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
608
609
610
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows, &
                                    mpi_comm_cols, success, THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) return
611
   ttt1 = MPI_Wtime()
612
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
613
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0
614
615
616
617
618
619
620

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
621
622
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, &
                                   mpi_comm_cols, useQRActual)
623
   ttt1 = MPI_Wtime()
624
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
625
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
626
627
628
   time_evp_back = ttt1-ttts

   deallocate(tmat)
629
630
631
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif
632
633
1  format(a,f10.3)

634
end function solve_evp_real_2stage
635
636
637
638
639

!-------------------------------------------------------------------------------

!-------------------------------------------------------------------------------

640
function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
Andreas Marek's avatar
Andreas Marek committed
641
                                    mpi_comm_rows, mpi_comm_cols,      &
642
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677

!-------------------------------------------------------------------------------
!  solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,*)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,*)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
678
679
680
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
681
   implicit none
Andreas Marek's avatar
Andreas Marek committed
682
683
   integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer                       :: THIS_COMPLEX_ELPA_KERNEL
684
685
686
687
688
689
690
691
692
693
   integer, intent(in)           :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex*16, intent(inout)     :: a(lda,*), q(ldq,*)
   real*8, intent(inout)         :: ev(na)

   integer                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex*16, allocatable       :: tmat(:,:,:)
   real*8, allocatable           :: q_real(:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
694

695
   logical                       :: success
696
697
698
#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif
Andreas Marek's avatar
Andreas Marek committed
699
700
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
701
702
703
704
705

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
706
707
708

   success = .true.

709
710
711
   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
Andreas Marek's avatar
Andreas Marek committed
712
   else
713
714
715
     ! if kernel is not choosen via api
     ! check whether set by environment variable
     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
Andreas Marek's avatar
Andreas Marek committed
716
   endif
717

Andreas Marek's avatar
Andreas Marek committed
718
719
   ! check whether choosen kernel is allowed
   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then
720

721
722
723
724
725
726
727
728
729
730
731
     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
         endif
       enddo
Andreas Marek's avatar
Andreas Marek committed
732

733
734
735
736
       write(error_unit,*) " "
       write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
     endif
     THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
737
738
!      call MPI_ABORT(mpi_comm_all, mpierr)
   endif
739
740
741
742
743
744
745
746
747
748
749
750
   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
751
752
   call bandred_complex(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
                        tmat, success)
753
754
755
756
757
758
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop()
#endif
     return
   endif
759
   ttt1 = MPI_Wtime()
760
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
761
      write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0
762
763
764
765
766
767
768
769

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
770
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
771
      write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

   allocate(q_real(l_rows,l_cols))

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
788
789
790
791
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,1), nblk, &
                    mpi_comm_rows, mpi_comm_cols, success)
   if (.not.(success)) return

792
   ttt1 = MPI_Wtime()
793
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
794
      write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
795
796
797
798
799
800
801
802
803
804
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
Andreas Marek's avatar
Andreas Marek committed
805
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,  &
806
807
808
                                       mpi_comm_rows, mpi_comm_cols,&
                                       success,THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) return
809
   ttt1 = MPI_Wtime()
810
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
811
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0
812
813
814
815
816
817
818
819
820

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
821
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
822
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
823
824
825
   time_evp_back = ttt1-ttts

   deallocate(tmat)
826
827
828
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif
829
830
831

1  format(a,f10.3)

832
end function solve_evp_complex_2stage
833
834
835

!-------------------------------------------------------------------------------

836
subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
837
                        tmat, success, useQR)
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865

!-------------------------------------------------------------------------------
!  bandred_real: Reduces a distributed symmetric matrix to band form
!
!  Parameters
!
!  na          Order of matrix
!
!  a(lda,*)    Distributed matrix which should be reduced.
!              Distribution is like in Scalapack.
!              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
!              a(:,:) is overwritten on exit with the band and the Householder vectors
!              in the upper half.
!
!  lda         Leading dimension of a
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi bandwith of output matrix
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!  tmat(nbw,nbw,num_blocks)    where num_blocks = (na-1)/nbw + 1
!              Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
866
867
868
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
869
870
   implicit none

871
872
   integer             :: na, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols
   real*8              :: a(lda,*), tmat(nbw,nbw,*)
873

874
875
876
877
878
   integer             :: my_prow, my_pcol, np_rows, np_cols, mpierr
   integer             :: l_cols, l_rows
   integer             :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
   integer             :: istep, ncol, lch, lcx, nlc
   integer             :: tile_size, l_rows_tile, l_cols_tile
879

880
   real*8              :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
881

882
   real*8, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)
883

884
   integer             :: pcol, prow
885
886
887
888
889
890

   ! needed for blocked QR decomposition
   integer             :: PQRPARAM(11), work_size
   real*8              :: dwork_size(1)
   real*8, allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)

891
892
893
   pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number
   prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number

894
895
   logical, intent(out):: success

896
897
   logical, intent(in) :: useQR

898
899
900
#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("bandred_real")
#endif
901
902
903
904
   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
905
   success = .true.
906
907
908

   ! Semibandwith nbw must be a multiple of blocksize nblk

909
910
911
912
913
   if (mod(nbw,nblk)/=0) then
     if (my_prow==0 .and. my_pcol==0) then
       write(error_unit,*) 'ERROR: nbw=',nbw,', nblk=',nblk
       write(error_unit,*) 'ELPA2 works only for nbw==n*nblk'
       success = .false.
914
!         call mpi_abort(mpi_comm_world,0,mpierr)
915
     endif
916
917
918
919
920
921
922
923
924
925
   endif

   ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

   tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
   tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

   l_rows_tile = tile_size/np_rows ! local rows of a tile
   l_cols_tile = tile_size/np_cols ! local cols of a tile

926
927
928
929
930
931
932
   if (useQR) then
     if (which_qr_decomposition == 1) then
       call qr_pqrparam_init(pqrparam,    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
       allocate(tauvector(na))
       allocate(blockheuristic(nblk))
       l_rows = local_index(na, my_prow, np_rows, nblk, -1)
       allocate(vmr(max(l_rows,1),na))
933

934
       call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), tmat(1,1,1), nbw, dwork_size(1), -1, na, &
935
                             nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
936
937
       work_size = dwork_size(1)
       allocate(work_blocked(work_size))
938

939
940
941
       work_blocked = 0.0d0
       deallocate(vmr)
     endif
942
943
   endif

944
945
   do istep = (na-1)/nbw, 1, -1

946
     n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
947

948
949
950
     ! Number of local columns/rows of remaining matrix
     l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
     l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
951

952
     ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
953

954
955
     allocate(vmr(max(l_rows,1),2*n_cols))
     allocate(umc(max(l_cols,1),2*n_cols))
956

957
     allocate(vr(l_rows+1))
958

959
960
961
     vmr(1:l_rows,1:n_cols) = 0.
     vr(:) = 0
     tmat(:,:,istep) = 0
962

963
     ! Reduce current block to lower triangular form
964
965
966
967
968
969
970
971
972
973

     if (useQR) then
       if (which_qr_decomposition == 1) then
         call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), &
                                  tmat(1,1,istep), nbw, work_blocked,       &
                                  work_size, na, n_cols, nblk, nblk,        &
                                  istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                  0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
                                  blockheuristic)
       endif
974
     else
975

976
       do lc = n_cols, 1, -1
977

978
979
         ncol = istep*nbw + lc ! absolute column number of householder vector
         nrow = ncol - nbw ! Absolute number of pivot row
980

981
982
         lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
         lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
983

984
         tau = 0
985

986
         if (nrow == 1) exit ! Nothing to do
987

988
         cur_pcol = pcol(ncol) ! Processor column owning current block
989

990
         if (my_pcol==cur_pcol) then
991

992
993
           ! Get vector to be transformed; distribute last element and norm of
           ! remaining elements to all procs in current column
994

995
           vr(1:lr) = a(1:lr,lch) ! vector to be transformed
996

997
998
999
1000
1001
1002
1003
           if (my_prow==prow(nrow)) then
             aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
             aux1(2) = vr(lr)
           else
             aux1(1) = dot_product(vr(1:lr),vr(1:lr))
             aux1(2) = 0.
           endif
1004

1005
           call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
1006

1007
1008
           vnorm2 = aux2(1)
           vrl    = aux2(2)
1009

1010
           ! Householder transformation
1011

1012
           call hh_transform_real(vrl, vnorm2, xf, tau)
1013

1014
           ! Scale vr and store Householder vector for back transformation
1015

1016
1017
1018
1019
1020
1021
1022
           vr(1:lr) = vr(1:lr) * xf
           if (my_prow==prow(nrow)) then
             a(1:lr-1,lch) = vr(1:lr-1)
             a(lr,lch) = vrl
             vr(lr) = 1.
           else
             a(1:lr,lch) = vr(1:lr)
1023
           endif
1024

1025
         endif
1026

1027
         ! Broadcast Householder vector and tau along columns
1028

1029
1030
1031
1032
1033
         vr(lr+1) = tau
         call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
         vmr(1:lr,lc) = vr(1:lr)
         tau = vr(lr+1)
         tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
1034

1035
1036
         ! Transform remaining columns in current block with Householder vector
         ! Local dot product
1037

1038
         aux1 = 0
1039

1040
1041
1042
1043
1044
1045
1046
1047
         nlc = 0 ! number of local columns
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
           endif
         enddo
1048

1049
1050
         ! Get global dot products
         if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
1051

1052
         ! Transform
1053

1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
         nlc = 0
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
           endif
         enddo

       enddo
1064

1065
1066
       ! Calculate scalar products of stored Householder vectors.
       ! This can be done in different ways, we use dsyrk
1067

1068
1069
       vav = 0
       if (l_rows>0) &
1070
           call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,1),0.d0,vav,ubound(vav,1))
1071
       call symm_matrix_allreduce(n_cols,vav,ubound(vav,1),mpi_comm_rows)
1072

1073
       ! Calculate triangular matrix T for block Householder Transformation
1074

1075
1076
1077
1078
1079
1080
1081
       do lc=n_cols,1,-1
         tau = tmat(lc,lc,istep)
         if (lc<n_cols) then
           call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,1),vav(lc+1,lc),1)
           tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
         endif
       enddo
1082
     endif
1083

1084
    ! Transpose vmr -> vmc (stored in umc, second half)
1085

1086
    call elpa_transpose_vectors  (vmr, ubound(vmr,1), mpi_comm_rows, &
1087
1088
1089
                                    umc(1,n_cols+1), ubound(umc,1), mpi_comm_cols, &
                                    1, istep*nbw, n_cols, nblk)

1090
1091
1092
1093
    ! Calculate umc = A**T * vmr
    ! Note that the distributed A has to be transposed
    ! Opposed to direct tridiagonalization there is no need to use the cache locality
    ! of the tiles, so we can use strips of the matrix
1094

1095
1096
1097
1098
    umc(1:l_cols,1:n_cols) = 0.d0
    vmr(1:l_rows,n_cols+1:2*n_cols) = 0
    if (l_cols>0 .and. l_rows>0) then
      do i=0,(istep*nbw-1)/tile_size
1099

1100
1101
1102
        lcs = i*l_cols_tile+1
        lce = min(l_cols,(i+1)*l_cols_tile)
        if (lce<lcs) cycle
1103

1104
1105
1106
        lre = min(l_rows,(i+1)*l_rows_tile)
        call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,1), &
                     vmr,ubound(vmr,1),1.d0,umc(lcs,1),ubound(umc,1))
1107

1108
1109
1110
1111
1112
1113
        if (i==0) cycle
        lre = min(l_rows,i*l_rows_tile)
        call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
                     umc(lcs,n_cols+1),ubound(umc,1),1.d0,vmr(1,n_cols+1),ubound(vmr,1))
      enddo
    endif
1114

1115
1116
1117
1118
    ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
    ! on the processors containing the diagonal
    ! This is only necessary if ur has been calculated, i.e. if the
    ! global tile size is smaller than the global remaining matrix
1119

1120
1121
1122
1123
1124
    if (tile_size < istep*nbw) then
       call elpa_reduce_add_vectors  (vmr(1,n_cols+1),ubound(vmr,1),mpi_comm_rows, &
                                      umc, ubound(umc,1), mpi_comm_cols, &
                                      istep*nbw, n_cols, nblk)
    endif
1125

1126
1127
1128
1129
1130
1131
    if (l_cols>0) then
      allocate(tmp(l_cols,n_cols))
      call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
      umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
      deallocate(tmp)
    endif
1132

1133
    ! U = U * Tmat**T
1134

1135
    call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,1),umc,ubound(umc,1))
1136

1137
    ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
1138

1139
1140
    call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umc,ubound(umc,1),umc(1,n_cols+1),ubound(umc,1),0.d0,vav,ubound(vav,1))
    call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,1),vav,ubound(vav,1))
1141

1142
    call symm_matrix_allreduce(n_cols,vav,ubound(vav,1),mpi_comm_cols)
1143

1144
1145
    ! U = U - 0.5 * V * VAV
    call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umc(1,n_cols+1),ubound(umc,1),vav,ubound(vav,1),1.d0,umc,ubound(umc,1))
1146

1147
    ! Transpose umc -> umr (stored in vmr, second half)
1148

1149
1150
1151
    call elpa_transpose_vectors  (umc, ubound(umc,1), mpi_comm_cols, &
                                   vmr(1,n_cols+1), ubound(vmr,1), mpi_comm_rows, &
                                   1, istep*nbw, n_cols, nblk)
1152

1153
    ! A = A - V*U**T - U*V**T
1154

1155
1156
1157
1158
1159
1160
1161
1162
1163
    do i=0,(istep*nbw-1)/tile_size
      lcs = i*l_cols_tile+1
      lce = min(l_cols,(i+1)*l_cols_tile)
      lre = min(l_rows,(i+1)*l_rows_tile)
      if (lce<lcs .or. lre<1) cycle
      call dgemm('N','T',lre,lce-lcs+1,2*n_cols,-1.d0, &
                  vmr,ubound(vmr,1),umc(lcs,1),ubound(umc,1), &
                  1.d0,a(1,lcs),lda)
    enddo
1164

1165
    deallocate(vmr, umc, vr)
1166

1167
  enddo
1168
#ifdef HAVE_DETAILED_TIMINGS
1169
  call timer%stop("bandred_real")
1170
#endif
1171

1172
1173
1174
1175
1176
  if (useQR) then
    if (which_qr_decomposition == 1) then
      deallocate(work_blocked)
      deallocate(tauvector)
    endif
1177
  endif
1178

1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
end subroutine bandred_real

!-------------------------------------------------------------------------------

subroutine symm_matrix_allreduce(n,a,lda,comm)

!-------------------------------------------------------------------------------
!  symm_matrix_allreduce: Does an mpi_allreduce for a symmetric matrix A.
!  On entry, only the upper half of A needs to be set
!  On exit, the complete matrix is set
!-------------------------------------------------------------------------------

   implicit none
   integer n, lda, comm
   real*8 a(lda,*)

   integer i, nc, mpierr
   real*8 h1(n*n), h2(n*n)

   nc = 0
   do i=1,n
1200
1201
     h1(nc+1:nc+i) = a(1:i,i)
     nc = nc+i
1202
1203
1204
1205
1206
1207
   enddo

   call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,comm,mpierr)

   nc = 0
   do i=1,n
1208
1209
1210
     a(1:i,i) = h2(nc+1:nc+i)
     a(i,1:i-1) = a(1:i-1,i)
     nc = nc+i
1211
1212
1213
1214
1215
1216
   enddo

end subroutine symm_matrix_allreduce

!-------------------------------------------------------------------------------

1217
1218
subroutine trans_ev_band_to_full_real(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, &
                                      mpi_comm_cols, useQR)
1219

Andreas Marek's avatar
Andreas Marek committed
1220

1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
!-------------------------------------------------------------------------------
!  trans_ev_band_to_full_real:
!  Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix
!
!  Parameters
!
!  na          Order of matrix a, number of rows of matrix q
!
!  nqc         Number of columns of matrix q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi bandwith
!
!  a(lda,*)    Matrix containing the Householder vectors (i.e. matrix a after bandred_real)
!              Distribution is like in Scalapack.
!
!  lda         Leading dimension of a
!
!  tmat(nbw,nbw,.) Factors returned by bandred_real
!
!  q           On input: Eigenvectors of band matrix
!              On output: Transformed eigenvectors
!              Distribution is like in Scalapack.
!
!  ldq         Leading dimension of q
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
1253
1254
1255
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
1256
1257
   implicit none

1258
1259
   integer              :: na, nqc, lda, ldq, nblk, nbw, mpi_comm_rows, mpi_comm_cols
   real*8               :: a(lda,*), q(ldq,*), tmat(nbw, nbw, *)
1260