elpa2.F90 201 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"

module ELPA2

! Version 1.1.2, 2011-02-21

  USE ELPA1

67
68
69
#ifdef HAVE_ISO_FORTRAN_ENV
  use iso_fortran_env, only : error_unit
#endif
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

  public :: bandred_real
  public :: tridiag_band_real
  public :: trans_ev_tridi_to_band_real
  public :: trans_ev_band_to_full_real

  public :: bandred_complex
  public :: tridiag_band_complex
  public :: trans_ev_tridi_to_band_complex
  public :: trans_ev_band_to_full_complex

Andreas Marek's avatar
Andreas Marek committed
89
90
91
92
93
94
95
96
97
98
99
100
  public :: get_actual_real_kernel_name, get_actual_complex_kernel_name
  public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, &
            REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ,                &
            REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_AVX_BLOCK2,         &
            REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6

  public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
            COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ,                &
            COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_AVX_BLOCK1,         &
            COMPLEX_ELPA_KERNEL_AVX_BLOCK2

  public :: print_available_real_kernels, print_available_complex_kernels
101
102
103
104
#ifndef HAVE_ISO_FORTRAN_ENV
  integer, parameter :: error_unit = 6
#endif

Andreas Marek's avatar
Andreas Marek committed
105
106
107
108
109
110
111
112
113
114
115
116

  integer, parameter :: number_of_real_kernels           = 8
  integer, parameter :: REAL_ELPA_KERNEL_GENERIC         = 1
  integer, parameter :: REAL_ELPA_KERNEL_GENERIC_SIMPLE  = 2
  integer, parameter :: REAL_ELPA_KERNEL_BGP             = 3
  integer, parameter :: REAL_ELPA_KERNEL_BGQ             = 4
  integer, parameter :: REAL_ELPA_KERNEL_SSE             = 5
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2      = 6
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4      = 7
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6      = 8

#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
117
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
118
#else
119
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
120
121
#endif
  character(35), parameter, dimension(number_of_real_kernels) :: &
122
123
124
125
126
127
128
129
  REAL_ELPA_KERNEL_NAMES =    (/"REAL_ELPA_KERNEL_GENERIC         ", &
                                "REAL_ELPA_KERNEL_GENERIC_SIMPLE  ", &
                                "REAL_ELPA_KERNEL_BGP             ", &
                                "REAL_ELPA_KERNEL_BGQ             ", &
                                "REAL_ELPA_KERNEL_SSE             ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK2      ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK4      ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK6      "/)
Andreas Marek's avatar
Andreas Marek committed
130
131
132
133
134
135
136
137
138
139
140

  integer, parameter :: number_of_complex_kernels           = 7
  integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC         = 1
  integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE  = 2
  integer, parameter :: COMPLEX_ELPA_KERNEL_BGP             = 3
  integer, parameter :: COMPLEX_ELPA_KERNEL_BGQ             = 4
  integer, parameter :: COMPLEX_ELPA_KERNEL_SSE             = 5
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1      = 6
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2      = 7

#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
141
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
142
#else
143
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
144
145
#endif
  character(35), parameter, dimension(number_of_complex_kernels) :: &
146
147
148
149
150
151
152
  COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC         ", &
                                "COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE  ", &
                                "COMPLEX_ELPA_KERNEL_BGP             ", &
                                "COMPLEX_ELPA_KERNEL_BGQ             ", &
                                "COMPLEX_ELPA_KERNEL_SSE             ", &
                                "COMPLEX_ELPA_KERNEL_AVX_BLOCK1      ", &
                                "COMPLEX_ELPA_KERNEL_AVX_BLOCK2      "/)
Andreas Marek's avatar
Andreas Marek committed
153
154
155
156
157
158
159

  integer, parameter                                    ::             &
           AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) =       &
                                      (/                               &
#if WITH_REAL_GENERIC_KERNEL
                                        1                              &
#else
160
                                        0                              &
Andreas Marek's avatar
Andreas Marek committed
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#endif
#if WITH_REAL_GENERIC_SIMPLE_KERNEL
                                          ,1                           &
#else
                                          ,0                           &
#endif
#if WITH_REAL_BGP_KERNEL
                                            ,1                         &
#else
                                            ,0                         &
#endif
#if WITH_REAL_BGQ_KERNEL
                                              ,1                       &
#else
                                              ,0                       &
#endif
#if WITH_REAL_SSE_KERNEL
                                                ,1                     &
#else
                                                ,0                     &
#endif
#if WITH_REAL_AVX_BLOCK2_KERNEL
                                                  ,1                   &
#else
                                                  ,0                   &
#endif
#if WITH_REAL_AVX_BLOCK4_KERNEL
                                                    ,1                 &
#else
                                                    ,0                 &
#endif
#if WITH_REAL_AVX_BLOCK6_KERNEL
                                                      ,1               &
#else
                                                      ,0               &
#endif
                                                       /)

  integer, parameter ::                                                   &
           AVAILABLE_COMPLEX_ELPA_KERNELS(number_of_complex_kernels) =    &
                                      (/                                  &
#if WITH_COMPLEX_GENERIC_KERNEL
                                        1                                 &
#else
                                        0                                 &
#endif
#if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
                                          ,1                              &
#else
                                          ,0                              &
#endif
#if WITH_COMPLEX_BGP_KERNEL
                                            ,1                            &
#else
                                            ,0                            &
#endif
#if WITH_COMPLEX_BGQ_KERNEL
                                              ,1                          &
#else
                                              ,0                          &
#endif
#if WITH_COMPLEX_SSE_KERNEL
                                                ,1                        &
#else
                                                ,0                        &
#endif
227
#if WITH_COMPLEX_AVX_BLOCK1_KERNEL
Andreas Marek's avatar
Andreas Marek committed
228
229
230
231
                                                  ,1                      &
#else
                                                  ,0                      &
#endif
232
#if WITH_COMPLEX_AVX_BLOCK2_KERNEL
Andreas Marek's avatar
Andreas Marek committed
233
234
235
236
237
                                                    ,1                    &
#else
                                                    ,0                    &
#endif
                                                   /)
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
!-------------------------------------------------------------------------------

  ! The following array contains the Householder vectors of the
  ! transformation band -> tridiagonal.
  ! It is allocated and set in tridiag_band_real and used in
  ! trans_ev_tridi_to_band_real.
  ! It must be deallocated by the user after trans_ev_tridi_to_band_real!

  real*8, allocatable :: hh_trans_real(:,:)
  complex*16, allocatable :: hh_trans_complex(:,:)

!-------------------------------------------------------------------------------

  include 'mpif.h'


!******
contains
Andreas Marek's avatar
Andreas Marek committed
256
257
258
259
260
subroutine print_available_real_kernels
!-------------------------------------------------------------------------------
!  print_available_real_kernels: Writes to error_unit the name of every real
!  kernel whose entry in AVAILABLE_REAL_ELPA_KERNELS equals 1, followed by the
!  kernel name currently reported by get_actual_real_kernel_name().
!-------------------------------------------------------------------------------

  implicit none

  integer :: ikernel

  do ikernel = 1, number_of_real_kernels
     ! entries different from 1 mark kernels that are not listed
     if (AVAILABLE_REAL_ELPA_KERNELS(ikernel) /= 1) cycle
     write(error_unit,*) REAL_ELPA_KERNEL_NAMES(ikernel)
  end do

  write(error_unit,*) " "
  write(error_unit,*) " At the moment the following kernel would be choosen:"
  write(error_unit,*) get_actual_real_kernel_name()

end subroutine print_available_real_kernels

subroutine print_available_complex_kernels
!-------------------------------------------------------------------------------
!  print_available_complex_kernels: Writes to error_unit the name of every
!  complex kernel whose entry in AVAILABLE_COMPLEX_ELPA_KERNELS equals 1,
!  followed by the kernel name currently reported by
!  get_actual_complex_kernel_name().
!-------------------------------------------------------------------------------

  implicit none

  integer :: ikernel

  do ikernel = 1, number_of_complex_kernels
     ! entries different from 1 mark kernels that are not listed
     if (AVAILABLE_COMPLEX_ELPA_KERNELS(ikernel) /= 1) cycle
     write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(ikernel)
  end do

  write(error_unit,*) " "
  write(error_unit,*) " At the moment the following kernel would be choosen:"
  write(error_unit,*) get_actual_complex_kernel_name()

end subroutine print_available_complex_kernels

292
function get_actual_real_kernel() result(actual_kernel)
!-------------------------------------------------------------------------------
!  get_actual_real_kernel: Returns the id of the real kernel that is selected
!  when no kernel is passed via the API: the value delivered by
!  real_kernel_via_environment_variable(), or DEFAULT_REAL_ELPA_KERNEL when
!  that value is 0.
!-------------------------------------------------------------------------------

  implicit none

  integer :: actual_kernel
  integer :: env_choice

  env_choice = real_kernel_via_environment_variable()

  select case (env_choice)
  case (0)
     ! nothing usable came from the environment: use the default kernel
     actual_kernel = DEFAULT_REAL_ELPA_KERNEL
  case default
     actual_kernel = env_choice
  end select

end function get_actual_real_kernel

306
function get_actual_real_kernel_name() result(actual_kernel_name)
!-------------------------------------------------------------------------------
!  get_actual_real_kernel_name: Returns the entry of REAL_ELPA_KERNEL_NAMES
!  that belongs to the kernel id reported by get_actual_real_kernel().
!-------------------------------------------------------------------------------

  implicit none

  character(35) :: actual_kernel_name

  actual_kernel_name = REAL_ELPA_KERNEL_NAMES(get_actual_real_kernel())

end function get_actual_real_kernel_name

314
function get_actual_complex_kernel() result(actual_kernel)
!-------------------------------------------------------------------------------
!  get_actual_complex_kernel: Returns the id of the complex kernel that is
!  selected when no kernel is passed via the API: the value delivered by
!  complex_kernel_via_environment_variable(), or DEFAULT_COMPLEX_ELPA_KERNEL
!  when that value is 0.
!-------------------------------------------------------------------------------

  implicit none

  integer :: actual_kernel
  integer :: env_choice

  env_choice = complex_kernel_via_environment_variable()

  select case (env_choice)
  case (0)
     ! nothing usable came from the environment: use the default kernel
     actual_kernel = DEFAULT_COMPLEX_ELPA_KERNEL
  case default
     actual_kernel = env_choice
  end select

end function get_actual_complex_kernel

328
function get_actual_complex_kernel_name() result(actual_kernel_name)
!-------------------------------------------------------------------------------
!  get_actual_complex_kernel_name: Returns the entry of
!  COMPLEX_ELPA_KERNEL_NAMES that belongs to the kernel id reported by
!  get_actual_complex_kernel().
!-------------------------------------------------------------------------------

  implicit none

  character(35) :: actual_kernel_name

  actual_kernel_name = COMPLEX_ELPA_KERNEL_NAMES(get_actual_complex_kernel())

end function get_actual_complex_kernel_name

function check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL) result(err)
!-------------------------------------------------------------------------------
!  check_allowed_real_kernels: Tests whether a real kernel id may be used.
!
!  Parameters
!
!  THIS_REAL_ELPA_KERNEL
!              Kernel id to be tested
!
!  Result
!
!  err         .true.  if the id lies outside 1..size(AVAILABLE_REAL_ELPA_KERNELS)
!                      or its entry in AVAILABLE_REAL_ELPA_KERNELS is not 1
!              .false. otherwise
!-------------------------------------------------------------------------------

  implicit none
  integer, intent(in) :: THIS_REAL_ELPA_KERNEL

  logical             :: err

  ! Reject by default; the id can come straight from the API caller, so it
  ! must be range checked before it is used as an array index.
  err = .true.

  if (THIS_REAL_ELPA_KERNEL < 1) return
  if (THIS_REAL_ELPA_KERNEL > size(AVAILABLE_REAL_ELPA_KERNELS)) return

  err = (AVAILABLE_REAL_ELPA_KERNELS(THIS_REAL_ELPA_KERNEL) /= 1)

end function check_allowed_real_kernels

function check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL) result(err)
!-------------------------------------------------------------------------------
!  check_allowed_complex_kernels: Tests whether a complex kernel id may be used.
!
!  Parameters
!
!  THIS_COMPLEX_ELPA_KERNEL
!              Kernel id to be tested
!
!  Result
!
!  err         .true.  if the id lies outside
!                      1..size(AVAILABLE_COMPLEX_ELPA_KERNELS) or its entry in
!                      AVAILABLE_COMPLEX_ELPA_KERNELS is not 1
!              .false. otherwise
!-------------------------------------------------------------------------------

  implicit none
  integer, intent(in) :: THIS_COMPLEX_ELPA_KERNEL

  logical             :: err

  ! Reject by default; the id can come straight from the API caller, so it
  ! must be range checked before it is used as an array index.
  err = .true.

  if (THIS_COMPLEX_ELPA_KERNEL < 1) return
  if (THIS_COMPLEX_ELPA_KERNEL > size(AVAILABLE_COMPLEX_ELPA_KERNELS)) return

  err = (AVAILABLE_COMPLEX_ELPA_KERNELS(THIS_COMPLEX_ELPA_KERNEL) /= 1)

end function check_allowed_complex_kernels

361
function real_kernel_via_environment_variable() result(kernel)
!-------------------------------------------------------------------------------
!  real_kernel_via_environment_variable: Looks up the environment variable
!  REAL_ELPA_KERNEL (only if HAVE_ENVIRONMENT_CHECKING is defined) and compares
!  its value with the entries of REAL_ELPA_KERNEL_NAMES.
!
!  Result
!
!  kernel      Index of the first matching entry of REAL_ELPA_KERNEL_NAMES,
!              0 if no entry matches
!-------------------------------------------------------------------------------
  implicit none
  integer :: kernel
  CHARACTER(len=255) :: REAL_KERNEL_ENVIRONMENT
  integer :: i

  ! Defined start values: without HAVE_ENVIRONMENT_CHECKING the buffer is never
  ! filled, and it must not be compared while still undefined.
  kernel = 0
  REAL_KERNEL_ENVIRONMENT = " "

#if defined(HAVE_ENVIRONMENT_CHECKING)
  call get_environment_variable("REAL_ELPA_KERNEL",REAL_KERNEL_ENVIRONMENT)
#endif

  do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
     if (trim(REAL_KERNEL_ENVIRONMENT) .eq. trim(REAL_ELPA_KERNEL_NAMES(i))) then
        kernel = i
        exit
     endif
  enddo

end function real_kernel_via_environment_variable

383
function complex_kernel_via_environment_variable() result(kernel)
!-------------------------------------------------------------------------------
!  complex_kernel_via_environment_variable: Looks up the environment variable
!  COMPLEX_ELPA_KERNEL (only if HAVE_ENVIRONMENT_CHECKING is defined) and
!  compares its value with the entries of COMPLEX_ELPA_KERNEL_NAMES.
!
!  Result
!
!  kernel      Index of the first matching entry of COMPLEX_ELPA_KERNEL_NAMES,
!              0 if no entry matches
!-------------------------------------------------------------------------------
  implicit none
  integer :: kernel

  CHARACTER(len=255) :: COMPLEX_KERNEL_ENVIRONMENT
  integer :: i

  ! Defined start values: without HAVE_ENVIRONMENT_CHECKING the buffer is never
  ! filled, and it must not be compared while still undefined.
  kernel = 0
  COMPLEX_KERNEL_ENVIRONMENT = " "

#if defined(HAVE_ENVIRONMENT_CHECKING)
  call get_environment_variable("COMPLEX_ELPA_KERNEL",COMPLEX_KERNEL_ENVIRONMENT)
#endif

  do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
     if (trim(COMPLEX_ELPA_KERNEL_NAMES(i)) .eq. trim(COMPLEX_KERNEL_ENVIRONMENT)) then
        kernel = i
        exit
     endif
  enddo

end function complex_kernel_via_environment_variable

404
function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,   &
                                 mpi_comm_rows, mpi_comm_cols,        &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_real_2stage: Solves the real eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,*)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,*)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!  THIS_REAL_ELPA_KERNEL_API
!              Optional: id of the kernel to use. If absent, the kernel reported
!              by get_actual_real_kernel() is taken. An id that is out of range
!              or not allowed is replaced by REAL_ELPA_KERNEL_GENERIC after a
!              message was written to error_unit by the process with my_pe == 0.
!
!  Result
!
!  success     .true. on entry to the stages; the function returns early with
!              the value set by bandred_real, solve_tridi or
!              trans_ev_tridi_to_band_real as soon as one of them reports
!              .false.
!
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer                       :: THIS_REAL_ELPA_KERNEL

   integer, intent(in)           :: na, nev, lda, ldq, nblk, mpi_comm_rows, &
                                    mpi_comm_cols, mpi_comm_all
   real*8, intent(inout)         :: a(lda,*), ev(na), q(ldq,*)

   integer                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer                       :: nbw, num_blocks
   real*8, allocatable           :: tmat(:,:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
   logical                       :: success
   logical                       :: kernel_in_range, kernel_rejected

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   success = .true.

   if (present(THIS_REAL_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
   else
      ! kernel not chosen via the API: take the one reported by
      ! get_actual_real_kernel()
      THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
   endif

   ! The id may come unchecked from the caller; validate the range here so that
   ! it is never used as an index into the kernel tables when it does not fit.
   kernel_in_range = (THIS_REAL_ELPA_KERNEL >= 1) .and. &
                     (THIS_REAL_ELPA_KERNEL <= number_of_real_kernels)

   ! check whether the chosen kernel is allowed
   if (kernel_in_range) then
      kernel_rejected = check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)
   else
      kernel_rejected = .true.
   endif

   if (kernel_rejected) then

      if(my_pe == 0) then
         write(error_unit,*) " "
         if (kernel_in_range) then
            write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
         else
            ! no name exists for an id outside the table: print the id itself
            write(error_unit,*) "The choosen kernel ",THIS_REAL_ELPA_KERNEL
         endif
         write(error_unit,*) "is not in the list of the allowed kernels!"
         write(error_unit,*) " "
         write(error_unit,*) "Allowed kernels are:"
         do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
            if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
               write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
            endif
         enddo

         write(error_unit,*) " "
         write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !"
      endif

      ! every process falls back, not only the one that prints
      THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC

   endif

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
                     tmat, success)
   if (.not.(success)) then
      ! keep timer start/stop balanced on the failure exit
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_real_2stage")
#endif
      return
   endif
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, &
                          mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, mpi_comm_rows,  &
                    mpi_comm_cols, success)
   if (.not.(success)) then
      ! hh_trans_real is module data set up by tridiag_band_real; release it
      ! here, otherwise it stays allocated after the failed call
      if (allocated(hh_trans_real)) deallocate(hh_trans_real)
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_real_2stage")
#endif
      return
   endif

   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows, &
                                    mpi_comm_cols, success, THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) then
      ! same cleanup as above: do not leave the Householder vectors allocated
      if (allocated(hh_trans_real)) deallocate(hh_trans_real)
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_real_2stage")
#endif
      return
   endif
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif

end function solve_evp_real_2stage
584
585
586
587
588

!-------------------------------------------------------------------------------

!-------------------------------------------------------------------------------

589
function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
                                    mpi_comm_rows, mpi_comm_cols,      &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,*)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,*)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!  THIS_COMPLEX_ELPA_KERNEL_API
!              Optional: id of the kernel to use. If absent, the kernel reported
!              by get_actual_complex_kernel() is taken. An id that is out of
!              range or not allowed is replaced by COMPLEX_ELPA_KERNEL_GENERIC
!              after a message was written to error_unit by the process with
!              my_pe == 0.
!
!  Result
!
!  success     .true. on entry to the stages; the function returns early with
!              the value set by bandred_complex, solve_tridi or
!              trans_ev_tridi_to_band_complex as soon as one of them reports
!              .false.
!
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer                       :: THIS_COMPLEX_ELPA_KERNEL
   integer, intent(in)           :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex*16, intent(inout)     :: a(lda,*), q(ldq,*)
   real*8, intent(inout)         :: ev(na)

   integer                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex*16, allocatable       :: tmat(:,:,:)
   real*8, allocatable           :: q_real(:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i

   logical                       :: success
   logical                       :: kernel_in_range, kernel_rejected

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   success = .true.

   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
   else
      ! kernel not chosen via the API: take the one reported by
      ! get_actual_complex_kernel()
      THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
   endif

   ! The id may come unchecked from the caller; validate the range here so that
   ! it is never used as an index into the kernel tables when it does not fit.
   kernel_in_range = (THIS_COMPLEX_ELPA_KERNEL >= 1) .and. &
                     (THIS_COMPLEX_ELPA_KERNEL <= number_of_complex_kernels)

   ! check whether the chosen kernel is allowed
   if (kernel_in_range) then
      kernel_rejected = check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)
   else
      kernel_rejected = .true.
   endif

   if (kernel_rejected) then

      if(my_pe == 0) then
         write(error_unit,*) " "
         if (kernel_in_range) then
            write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
         else
            ! no name exists for an id outside the table: print the id itself
            write(error_unit,*) "The choosen kernel ",THIS_COMPLEX_ELPA_KERNEL
         endif
         write(error_unit,*) "is not in the list of the allowed kernels!"
         write(error_unit,*) " "
         write(error_unit,*) "Allowed kernels are:"
         do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
            if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
               write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
            endif
         enddo

         write(error_unit,*) " "
         write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
      endif

      ! every process falls back, not only the one that prints
      THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC

   endif

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_complex(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
                        tmat, success)
   if (.not.(success)) then
      ! keep timer start/stop balanced on the failure exit
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_complex_2stage")
#endif
      return
   endif
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

   ! solve_tridi works on a real array; its result is copied into q below
   allocate(q_real(l_rows,l_cols))

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,1), nblk, &
                    mpi_comm_rows, mpi_comm_cols, success)
   if (.not.(success)) then
      ! hh_trans_complex is module data (see its declaration); release it here,
      ! otherwise it stays allocated after the failed call
      if (allocated(hh_trans_complex)) deallocate(hh_trans_complex)
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_complex_2stage")
#endif
      return
   endif

   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
      write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,  &
                                       mpi_comm_rows, mpi_comm_cols,&
                                       success,THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) then
      ! same cleanup as above: do not leave the Householder vectors allocated
      if (allocated(hh_trans_complex)) deallocate(hh_trans_complex)
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_complex_2stage")
#endif
      return
   endif
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif

end function solve_evp_complex_2stage
782
783
784

!-------------------------------------------------------------------------------

785
786
subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
                        tmat, success)
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814

!-------------------------------------------------------------------------------
!  bandred_real: Reduces a distributed symmetric matrix to band form
!
!  Parameters
!
!  na          Order of matrix
!
!  a(lda,*)    Distributed matrix which should be reduced.
!              The distribution is as in ScaLAPACK.
!              Unlike in ScaLAPACK, a(:,:) must be set completely (upper and lower half).
!              a(:,:) is overwritten on exit with the band and the Householder vectors
!              in the upper half.
!
!  lda         Leading dimension of a
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi bandwidth of output matrix
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!  tmat(nbw,nbw,num_blocks)    where num_blocks = (na-1)/nbw + 1
!              Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
815
816
817
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
818
819
   implicit none

820
821
   integer             :: na, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols
   real*8              :: a(lda,*), tmat(nbw,nbw,*)
822

823
824
825
826
827
   integer             :: my_prow, my_pcol, np_rows, np_cols, mpierr
   integer             :: l_cols, l_rows
   integer             :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
   integer             :: istep, ncol, lch, lcx, nlc
   integer             :: tile_size, l_rows_tile, l_cols_tile
828

829
   real*8              :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
830

831
   real*8, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)
832

833
   integer             :: pcol, prow
834
835
836
   pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number
   prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number

837
838
   logical, intent(out):: success

839
840
841
#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("bandred_real")
#endif
842
843
844
845
   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
846
   success = .true.
847
848
849
850
851

   ! Semibandwidth nbw must be a multiple of blocksize nblk

   if(mod(nbw,nblk)/=0) then
      if(my_prow==0 .and. my_pcol==0) then
852
853
         write(error_unit,*) 'ERROR: nbw=',nbw,', nblk=',nblk
         write(error_unit,*) 'ELPA2 works only for nbw==n*nblk'
854
855
         success = .false.
!         call mpi_abort(mpi_comm_world,0,mpierr)
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
      endif
   endif

   ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

   tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
   tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

   l_rows_tile = tile_size/np_rows ! local rows of a tile
   l_cols_tile = tile_size/np_cols ! local cols of a tile

   do istep = (na-1)/nbw, 1, -1

      n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

      ! Number of local columns/rows of remaining matrix
      l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
      l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

      ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

      allocate(vmr(max(l_rows,1),2*n_cols))
      allocate(umc(max(l_cols,1),2*n_cols))

      allocate(vr(l_rows+1))

      vmr(1:l_rows,1:n_cols) = 0.
      vr(:) = 0
      tmat(:,:,istep) = 0

      ! Reduce current block to lower triangular form

      do lc = n_cols, 1, -1

         ncol = istep*nbw + lc ! absolute column number of householder vector
         nrow = ncol - nbw ! Absolute number of pivot row

         lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
         lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

         tau = 0

         if(nrow == 1) exit ! Nothing to do

         cur_pcol = pcol(ncol) ! Processor column owning current block

         if(my_pcol==cur_pcol) then

            ! Get vector to be transformed; distribute last element and norm of
            ! remaining elements to all procs in current column

            vr(1:lr) = a(1:lr,lch) ! vector to be transformed

            if(my_prow==prow(nrow)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
            else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
               aux1(2) = 0.
            endif

            call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

            vnorm2 = aux2(1)
            vrl    = aux2(2)

            ! Householder transformation

            call hh_transform_real(vrl, vnorm2, xf, tau)

            ! Scale vr and store Householder vector for back transformation

            vr(1:lr) = vr(1:lr) * xf
            if(my_prow==prow(nrow)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
               vr(lr) = 1.
            else
               a(1:lr,lch) = vr(1:lr)
            endif

         endif

         ! Broadcast Householder vector and tau along columns

         vr(lr+1) = tau
         call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
         vmr(1:lr,lc) = vr(1:lr)
         tau = vr(lr+1)
         tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat

         ! Transform remaining columns in current block with Householder vector

         ! Local dot product

         aux1 = 0

         nlc = 0 ! number of local columns
         do j=1,lc-1
            lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
            if(lcx>0) then
               nlc = nlc+1
               if(lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
            endif
         enddo

         ! Get global dot products
         if(nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

         ! Transform

         nlc = 0
         do j=1,lc-1
            lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
            if(lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
            endif
         enddo

      enddo

      ! Calculate scalar products of stored Householder vectors.
      ! This can be done in different ways, we use dsyrk

      vav = 0
      if(l_rows>0) &
         call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,1),0.d0,vav,ubound(vav,1))
      call symm_matrix_allreduce(n_cols,vav,ubound(vav,1),mpi_comm_rows)

      ! Calculate triangular matrix T for block Householder Transformation

      do lc=n_cols,1,-1
         tau = tmat(lc,lc,istep)
         if(lc<n_cols) then
            call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,1),vav(lc+1,lc),1)
            tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
         endif
      enddo

      ! Transpose vmr -> vmc (stored in umc, second half)

      call elpa_transpose_vectors  (vmr, ubound(vmr,1), mpi_comm_rows, &
                                    umc(1,n_cols+1), ubound(umc,1), mpi_comm_cols, &
                                    1, istep*nbw, n_cols, nblk)
For faster browsing, not all history is shown. View entire blame