elpa2.F90 221 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"

module ELPA2

! Version 1.1.2, 2011-02-21

  USE ELPA1

67
68
69
#ifdef HAVE_ISO_FORTRAN_ENV
  use iso_fortran_env, only : error_unit
#endif
70
71
72

  use elpa_pdgeqrf

73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

  public :: bandred_real
  public :: tridiag_band_real
  public :: trans_ev_tridi_to_band_real
  public :: trans_ev_band_to_full_real

  public :: bandred_complex
  public :: tridiag_band_complex
  public :: trans_ev_tridi_to_band_complex
  public :: trans_ev_band_to_full_complex

Andreas Marek's avatar
Andreas Marek committed
92
93
94
95
96
97
98
99
100
101
102
103
  public :: get_actual_real_kernel_name, get_actual_complex_kernel_name
  public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, &
            REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ,                &
            REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_AVX_BLOCK2,         &
            REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6

  public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
            COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ,                &
            COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_AVX_BLOCK1,         &
            COMPLEX_ELPA_KERNEL_AVX_BLOCK2

  public :: print_available_real_kernels, print_available_complex_kernels
104
105
106
107
#ifndef HAVE_ISO_FORTRAN_ENV
  integer, parameter :: error_unit = 6
#endif

Andreas Marek's avatar
Andreas Marek committed
108
109
110
111
112
113
114
115
116
117
118
119

  integer, parameter :: number_of_real_kernels           = 8
  integer, parameter :: REAL_ELPA_KERNEL_GENERIC         = 1
  integer, parameter :: REAL_ELPA_KERNEL_GENERIC_SIMPLE  = 2
  integer, parameter :: REAL_ELPA_KERNEL_BGP             = 3
  integer, parameter :: REAL_ELPA_KERNEL_BGQ             = 4
  integer, parameter :: REAL_ELPA_KERNEL_SSE             = 5
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2      = 6
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4      = 7
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6      = 8

#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
120
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
121
#else
122
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
123
124
#endif
  character(35), parameter, dimension(number_of_real_kernels) :: &
125
126
127
128
129
130
131
132
  REAL_ELPA_KERNEL_NAMES =    (/"REAL_ELPA_KERNEL_GENERIC         ", &
                                "REAL_ELPA_KERNEL_GENERIC_SIMPLE  ", &
                                "REAL_ELPA_KERNEL_BGP             ", &
                                "REAL_ELPA_KERNEL_BGQ             ", &
                                "REAL_ELPA_KERNEL_SSE             ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK2      ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK4      ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK6      "/)
Andreas Marek's avatar
Andreas Marek committed
133
134
135
136
137
138
139
140
141
142
143

  integer, parameter :: number_of_complex_kernels           = 7
  integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC         = 1
  integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE  = 2
  integer, parameter :: COMPLEX_ELPA_KERNEL_BGP             = 3
  integer, parameter :: COMPLEX_ELPA_KERNEL_BGQ             = 4
  integer, parameter :: COMPLEX_ELPA_KERNEL_SSE             = 5
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1      = 6
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2      = 7

#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
144
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
145
#else
146
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
147
148
#endif
  character(35), parameter, dimension(number_of_complex_kernels) :: &
149
150
151
152
153
154
155
  COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC         ", &
                                "COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE  ", &
                                "COMPLEX_ELPA_KERNEL_BGP             ", &
                                "COMPLEX_ELPA_KERNEL_BGQ             ", &
                                "COMPLEX_ELPA_KERNEL_SSE             ", &
                                "COMPLEX_ELPA_KERNEL_AVX_BLOCK1      ", &
                                "COMPLEX_ELPA_KERNEL_AVX_BLOCK2      "/)
Andreas Marek's avatar
Andreas Marek committed
156
157
158
159
160
161
162

  integer, parameter                                    ::             &
           AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) =       &
                                      (/                               &
#if WITH_REAL_GENERIC_KERNEL
                                        1                              &
#else
163
                                        0                              &
Andreas Marek's avatar
Andreas Marek committed
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#endif
#if WITH_REAL_GENERIC_SIMPLE_KERNEL
                                          ,1                           &
#else
                                          ,0                           &
#endif
#if WITH_REAL_BGP_KERNEL
                                            ,1                         &
#else
                                            ,0                         &
#endif
#if WITH_REAL_BGQ_KERNEL
                                              ,1                       &
#else
                                              ,0                       &
#endif
#if WITH_REAL_SSE_KERNEL
                                                ,1                     &
#else
                                                ,0                     &
#endif
#if WITH_REAL_AVX_BLOCK2_KERNEL
                                                  ,1                   &
#else
                                                  ,0                   &
#endif
#if WITH_REAL_AVX_BLOCK4_KERNEL
                                                    ,1                 &
#else
                                                    ,0                 &
#endif
#if WITH_REAL_AVX_BLOCK6_KERNEL
                                                      ,1               &
#else
                                                      ,0               &
#endif
                                                       /)

  integer, parameter ::                                                   &
           AVAILABLE_COMPLEX_ELPA_KERNELS(number_of_complex_kernels) =    &
                                      (/                                  &
#if WITH_COMPLEX_GENERIC_KERNEL
                                        1                                 &
#else
                                        0                                 &
#endif
#if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
                                          ,1                              &
#else
                                          ,0                              &
#endif
#if WITH_COMPLEX_BGP_KERNEL
                                            ,1                            &
#else
                                            ,0                            &
#endif
#if WITH_COMPLEX_BGQ_KERNEL
                                              ,1                          &
#else
                                              ,0                          &
#endif
#if WITH_COMPLEX_SSE_KERNEL
                                                ,1                        &
#else
                                                ,0                        &
#endif
230
#if WITH_COMPLEX_AVX_BLOCK1_KERNEL
Andreas Marek's avatar
Andreas Marek committed
231
232
233
234
                                                  ,1                      &
#else
                                                  ,0                      &
#endif
235
#if WITH_COMPLEX_AVX_BLOCK2_KERNEL
Andreas Marek's avatar
Andreas Marek committed
236
237
238
239
240
                                                    ,1                    &
#else
                                                    ,0                    &
#endif
                                                   /)
241
242
243
244
245
246
247

  public :: band_band_real
  public :: divide_band

  integer, public :: which_qr_decomposition = 1     ! defines, which QR-decomposition algorithm will be used
                                                    ! 0 for unblocked
                                                    ! 1 for blocked (maxrank: nblk)
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
!-------------------------------------------------------------------------------

  ! The following array contains the Householder vectors of the
  ! transformation band -> tridiagonal.
  ! It is allocated and set in tridiag_band_real and used in
  ! trans_ev_tridi_to_band_real.
  ! It must be deallocated by the user after trans_ev_tridi_to_band_real!

  real*8, allocatable :: hh_trans_real(:,:)
  complex*16, allocatable :: hh_trans_complex(:,:)

!-------------------------------------------------------------------------------

  include 'mpif.h'


!******
contains
266

Andreas Marek's avatar
Andreas Marek committed
267
subroutine print_available_real_kernels
Andreas Marek's avatar
Andreas Marek committed
268
269
270
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
Andreas Marek's avatar
Andreas Marek committed
271
272
273
  implicit none

  integer :: i
274

Andreas Marek's avatar
Andreas Marek committed
275
276
277
278
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("print_available_real_kernels")
#endif

Andreas Marek's avatar
Andreas Marek committed
279
  do i=1, number_of_real_kernels
280
281
282
    if (AVAILABLE_REAL_ELPA_KERNELS(i) .eq. 1) then
      write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
    endif
Andreas Marek's avatar
Andreas Marek committed
283
284
285
286
287
  enddo
  write(error_unit,*) " "
  write(error_unit,*) " At the moment the following kernel would be choosen:"
  write(error_unit,*) get_actual_real_kernel_name()

Andreas Marek's avatar
Andreas Marek committed
288
289
290
#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("print_available_real_kernels")
#endif
Andreas Marek's avatar
Andreas Marek committed
291
292
293
294

end subroutine print_available_real_kernels

subroutine print_available_complex_kernels
Andreas Marek's avatar
Andreas Marek committed
295
296
297
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
Andreas Marek's avatar
Andreas Marek committed
298
299
300
301

  implicit none

  integer :: i
Andreas Marek's avatar
Andreas Marek committed
302
303
304
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("print_available_complex_kernels")
#endif
305

306
  do i=1, number_of_complex_kernels
307
308
309
    if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .eq. 1) then
       write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
    endif
Andreas Marek's avatar
Andreas Marek committed
310
311
312
  enddo
  write(error_unit,*) " "
  write(error_unit,*) " At the moment the following kernel would be choosen:"
Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
313
  write(error_unit,*) get_actual_complex_kernel_name()
Andreas Marek's avatar
Andreas Marek committed
314

Andreas Marek's avatar
Andreas Marek committed
315
316
317
#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("print_available_complex_kernels")
#endif
Andreas Marek's avatar
Andreas Marek committed
318
319
320

end subroutine print_available_complex_kernels

321
function get_actual_real_kernel() result(actual_kernel)
Andreas Marek's avatar
Andreas Marek committed
322
323
324
325
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  implicit none
Andreas Marek's avatar
Andreas Marek committed
326
327

  integer :: actual_kernel
328

Andreas Marek's avatar
Andreas Marek committed
329
330
331
332
333
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("get_actual_real_kernel")
#endif


Andreas Marek's avatar
Andreas Marek committed
334
335
336
  ! if kernel is not choosen via api
  ! check whether set by environment variable
  actual_kernel = real_kernel_via_environment_variable()
337

Andreas Marek's avatar
Andreas Marek committed
338
  if (actual_kernel .eq. 0) then
339
340
    ! if not then set default kernel
    actual_kernel = DEFAULT_REAL_ELPA_KERNEL
Andreas Marek's avatar
Andreas Marek committed
341
  endif
Andreas Marek's avatar
Andreas Marek committed
342
343
344
345
346

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("get_actual_real_kernel")
#endif

Andreas Marek's avatar
Andreas Marek committed
347
348
end function get_actual_real_kernel

349
function get_actual_real_kernel_name() result(actual_kernel_name)
Andreas Marek's avatar
Andreas Marek committed
350
351
352
353
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  implicit none
Andreas Marek's avatar
Andreas Marek committed
354
355
356

  character(35) :: actual_kernel_name
  integer       :: actual_kernel
Andreas Marek's avatar
Andreas Marek committed
357
358
359
360
361

#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("get_actual_real_kernel_name")
#endif

Andreas Marek's avatar
Andreas Marek committed
362
363
  actual_kernel = get_actual_real_kernel()
  actual_kernel_name = REAL_ELPA_KERNEL_NAMES(actual_kernel)
Andreas Marek's avatar
Andreas Marek committed
364
365
366
367
368

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("get_actual_real_kernel_name")
#endif

Andreas Marek's avatar
Andreas Marek committed
369
370
end function get_actual_real_kernel_name

371
function get_actual_complex_kernel() result(actual_kernel)
Andreas Marek's avatar
Andreas Marek committed
372
373
374
375
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  implicit none
Andreas Marek's avatar
Andreas Marek committed
376
  integer :: actual_kernel
377

Andreas Marek's avatar
Andreas Marek committed
378
379
380
381
382
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("get_actual_complex_kernel")
#endif


Andreas Marek's avatar
Andreas Marek committed
383
384
385
  ! if kernel is not choosen via api
  ! check whether set by environment variable
  actual_kernel = complex_kernel_via_environment_variable()
386

Andreas Marek's avatar
Andreas Marek committed
387
  if (actual_kernel .eq. 0) then
388
389
    ! if not then set default kernel
    actual_kernel = DEFAULT_COMPLEX_ELPA_KERNEL
Andreas Marek's avatar
Andreas Marek committed
390
  endif
Andreas Marek's avatar
Andreas Marek committed
391
392
393
394
395

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("get_actual_complex_kernel")
#endif

Andreas Marek's avatar
Andreas Marek committed
396
397
end function get_actual_complex_kernel

398
function get_actual_complex_kernel_name() result(actual_kernel_name)
Andreas Marek's avatar
Andreas Marek committed
399
400
401
402
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  implicit none
Andreas Marek's avatar
Andreas Marek committed
403
404
  character(35) :: actual_kernel_name
  integer       :: actual_kernel
Andreas Marek's avatar
Andreas Marek committed
405
406
407
408
409

#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("get_actual_complex_kernel_name")
#endif

Andreas Marek's avatar
Andreas Marek committed
410
411
  actual_kernel = get_actual_complex_kernel()
  actual_kernel_name = COMPLEX_ELPA_KERNEL_NAMES(actual_kernel)
Andreas Marek's avatar
Andreas Marek committed
412
413
414
415
416
417


#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("get_actual_complex_kernel_name")
#endif

Andreas Marek's avatar
Andreas Marek committed
418
419
420
end function get_actual_complex_kernel_name

function check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL) result(err)
Andreas Marek's avatar
Andreas Marek committed
421
422
423
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
Andreas Marek's avatar
Andreas Marek committed
424
425
  implicit none
  integer, intent(in) :: THIS_REAL_ELPA_KERNEL
426

Andreas Marek's avatar
Andreas Marek committed
427
428
  logical             :: err

Andreas Marek's avatar
Andreas Marek committed
429
430
431
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("check_allowed_real_kernels")
#endif
Andreas Marek's avatar
Andreas Marek committed
432
433
434
435
  err = .false.

  if (AVAILABLE_REAL_ELPA_KERNELS(THIS_REAL_ELPA_KERNEL) .ne. 1) err=.true.

Andreas Marek's avatar
Andreas Marek committed
436
437
438
439
#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("check_allowed_real_kernels")
#endif

Andreas Marek's avatar
Andreas Marek committed
440
441
442
end function check_allowed_real_kernels

function check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL) result(err)
Andreas Marek's avatar
Andreas Marek committed
443
444
445
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
Andreas Marek's avatar
Andreas Marek committed
446
447
448
449
  implicit none
  integer, intent(in) :: THIS_COMPLEX_ELPA_KERNEL

  logical             :: err
Andreas Marek's avatar
Andreas Marek committed
450
451
452
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("check_allowed_complex_kernels")
#endif
Andreas Marek's avatar
Andreas Marek committed
453
454
455
  err = .false.

  if (AVAILABLE_COMPLEX_ELPA_KERNELS(THIS_COMPLEX_ELPA_KERNEL) .ne. 1) err=.true.
Andreas Marek's avatar
Andreas Marek committed
456
457
458
459
460

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("check_allowed_complex_kernels")
#endif

Andreas Marek's avatar
Andreas Marek committed
461
462
end function check_allowed_complex_kernels

463
function qr_decomposition_via_environment_variable(useQR) result(isSet)
Andreas Marek's avatar
Andreas Marek committed
464
465
466
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
467
468
469
470
471
  implicit none
  logical, intent(out) :: useQR
  logical              :: isSet
  CHARACTER(len=255)   :: ELPA_QR_DECOMPOSITION

Andreas Marek's avatar
Andreas Marek committed
472
473
474
475
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("qr_decomposition_via_environment_variable")
#endif

476
477
478
479
480
481
482
483
484
485
486
487
488
489
  isSet = .false.

#if defined(HAVE_ENVIRONMENT_CHECKING)
  call get_environment_variable("ELPA_QR_DECOMPOSITION",ELPA_QR_DECOMPOSITION)
#endif
  if (trim(ELPA_QR_DECOMPOSITION) .eq. "yes") then
    useQR = .true.
    isSet = .true.
  endif
  if (trim(ELPA_QR_DECOMPOSITION) .eq. "no") then
    useQR = .false.
    isSet = .true.
  endif

Andreas Marek's avatar
Andreas Marek committed
490
491
492
493
#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("qr_decomposition_via_environment_variable")
#endif

494
495
496
end function qr_decomposition_via_environment_variable


497
function real_kernel_via_environment_variable() result(kernel)
Andreas Marek's avatar
Andreas Marek committed
498
499
500
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
Andreas Marek's avatar
Andreas Marek committed
501
502
503
504
505
  implicit none
  integer :: kernel
  CHARACTER(len=255) :: REAL_KERNEL_ENVIRONMENT
  integer :: i

Andreas Marek's avatar
Andreas Marek committed
506
507
508
509
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("real_kernel_via_environment_variable")
#endif

Andreas Marek's avatar
Andreas Marek committed
510
511
512
513
514
#if defined(HAVE_ENVIRONMENT_CHECKING)
  call get_environment_variable("REAL_ELPA_KERNEL",REAL_KERNEL_ENVIRONMENT)
#endif
  do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
!     if (trim(dummy_char) .eq. trim(REAL_ELPA_KERNEL_NAMES(i))) then
515
516
517
518
519
520
    if (trim(REAL_KERNEL_ENVIRONMENT) .eq. trim(REAL_ELPA_KERNEL_NAMES(i))) then
      kernel = i
      exit
    else
      kernel = 0
    endif
Andreas Marek's avatar
Andreas Marek committed
521
  enddo
Andreas Marek's avatar
Andreas Marek committed
522
523
524
525
526

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("real_kernel_via_environment_variable")
#endif

Andreas Marek's avatar
Andreas Marek committed
527
528
end function real_kernel_via_environment_variable

529
function complex_kernel_via_environment_variable() result(kernel)
Andreas Marek's avatar
Andreas Marek committed
530
531
532
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
Andreas Marek's avatar
Andreas Marek committed
533
534
535
536
537
  implicit none
  integer :: kernel

  CHARACTER(len=255) :: COMPLEX_KERNEL_ENVIRONMENT
  integer :: i
Andreas Marek's avatar
Andreas Marek committed
538
539
540
541
542

#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("complex_kernel_via_environment_variable")
#endif

Andreas Marek's avatar
Andreas Marek committed
543
544
545
546
547
#if defined(HAVE_ENVIRONMENT_CHECKING)
  call get_environment_variable("COMPLEX_ELPA_KERNEL",COMPLEX_KERNEL_ENVIRONMENT)
#endif

  do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
548
549
550
551
552
553
    if (trim(COMPLEX_ELPA_KERNEL_NAMES(i)) .eq. trim(COMPLEX_KERNEL_ENVIRONMENT)) then
      kernel = i
      exit
    else
      kernel = 0
    endif
Andreas Marek's avatar
Andreas Marek committed
554
555
  enddo

Andreas Marek's avatar
Andreas Marek committed
556
557
558
559
560
#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("complex_kernel_via_environment_variable")
#endif


Andreas Marek's avatar
Andreas Marek committed
561
562
end function complex_kernel_via_environment_variable

563
564
565
566
function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601

!-------------------------------------------------------------------------------
!  solve_evp_real_2stage: Solves the real eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,*)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,*)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
602
603
604
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
605
   implicit none
606
607
   logical, intent(in), optional :: useQR
   logical                       :: useQRActual, useQREnvironment
Andreas Marek's avatar
Andreas Marek committed
608
   integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
609
   integer                       :: THIS_REAL_ELPA_KERNEL
610

611
612
613
   integer, intent(in)           :: na, nev, lda, ldq, nblk, mpi_comm_rows, &
                                    mpi_comm_cols, mpi_comm_all
   real*8, intent(inout)         :: a(lda,*), ev(na), q(ldq,*)
614

615
616
617
618
619
620
   integer                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer                       :: nbw, num_blocks
   real*8, allocatable           :: tmat(:,:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
   logical                       :: success
621

622
623
624
#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif
625
626
627
628
629
630
631
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
632

633
634
   success = .true.

635
636
637
638
639
640
641
642
643
644
645
646
647
648
   useQRActual = .false.

   ! set usage of qr decomposition via API call
   if (present(useQR)) then
     if (useQR) useQRActual = .true.
     if (useQR .and. na .lt. 800) useQRActual = .false.
     if (.not.(useQR)) useQRACtual = .false.
   endif

   ! overwrite this with environment variable settings
   if (qr_decomposition_via_environment_variable(useQREnvironment)) then
     useQRActual = useQREnvironment
   endif

649
650
651
   if (present(THIS_REAL_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
Andreas Marek's avatar
Andreas Marek committed
652
   else
653

654
655
656
     ! if kernel is not choosen via api
     ! check whether set by environment variable
     THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
Andreas Marek's avatar
Andreas Marek committed
657
658
659
660
   endif

   ! check whether choosen kernel is allowed
   if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
661

662
663
664
665
666
667
668
669
670
671
672
     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
         endif
       enddo
Andreas Marek's avatar
Andreas Marek committed
673

674
675
676
677
       write(error_unit,*) " "
       write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !"
     endif
     THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
678
679

   endif
680
681
682
683
684
685
686
687
688
689
690
691
692

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
693
   call bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
694
                     tmat, success, useQRActual)
695
   if (.not.(success)) return
696
   ttt1 = MPI_Wtime()
697
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
698
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0
699
700
701
702
703
704

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
Andreas Marek's avatar
Andreas Marek committed
705
706
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, &
                          mpi_comm_cols, mpi_comm_all)
707
   ttt1 = MPI_Wtime()
708
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
709
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0
710
711
712
713
714
715
716
717
718
719

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
720
721
722
723
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, mpi_comm_rows,  &
                    mpi_comm_cols, success)
   if (.not.(success)) return

724
   ttt1 = MPI_Wtime()
725
726
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
     write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
727
728
729
730
731
732
733
734
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
735
736
737
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows, &
                                    mpi_comm_cols, success, THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) return
738
   ttt1 = MPI_Wtime()
739
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
740
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0
741
742
743
744
745
746
747

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
748
749
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, &
                                   mpi_comm_cols, useQRActual)
750
   ttt1 = MPI_Wtime()
751
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
752
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
753
754
755
   time_evp_back = ttt1-ttts

   deallocate(tmat)
756
757
758
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif
759
760
1  format(a,f10.3)

761
end function solve_evp_real_2stage
762
763
764
765
766

!-------------------------------------------------------------------------------

!-------------------------------------------------------------------------------

767
function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
Andreas Marek's avatar
Andreas Marek committed
768
                                    mpi_comm_rows, mpi_comm_cols,      &
769
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804

!-------------------------------------------------------------------------------
!  solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,*)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,*)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
805
806
807
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
808
   implicit none
Andreas Marek's avatar
Andreas Marek committed
809
810
   integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer                       :: THIS_COMPLEX_ELPA_KERNEL
811
812
813
814
815
816
817
818
819
820
   integer, intent(in)           :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex*16, intent(inout)     :: a(lda,*), q(ldq,*)
   real*8, intent(inout)         :: ev(na)

   integer                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex*16, allocatable       :: tmat(:,:,:)
   real*8, allocatable           :: q_real(:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
821

822
   logical                       :: success
823
824
825
#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif
Andreas Marek's avatar
Andreas Marek committed
826
827
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
828
829
830
831
832

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
833
834
835

   success = .true.

836
837
838
   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
Andreas Marek's avatar
Andreas Marek committed
839
   else
840
841
842
     ! if kernel is not choosen via api
     ! check whether set by environment variable
     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
Andreas Marek's avatar
Andreas Marek committed
843
   endif
844

Andreas Marek's avatar
Andreas Marek committed
845
846
   ! check whether choosen kernel is allowed
   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then
847

848
849
850
851
852
853
854
855
856
857
858
     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
         endif
       enddo
Andreas Marek's avatar
Andreas Marek committed
859

860
861
862
863
       write(error_unit,*) " "
       write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
     endif
     THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
864
865
!      call MPI_ABORT(mpi_comm_all, mpierr)
   endif
866
867
868
869
870
871
872
873
874
875
876
877
   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
878
879
   call bandred_complex(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
                        tmat, success)
880
881
882
883
884
885
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop()
#endif
     return
   endif
886
   ttt1 = MPI_Wtime()
887
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
888
      write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0
889
890
891
892
893
894
895
896

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
897
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
898
      write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

   allocate(q_real(l_rows,l_cols))

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
915
916
917
918
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,1), nblk, &
                    mpi_comm_rows, mpi_comm_cols, success)
   if (.not.(success)) return

919
   ttt1 = MPI_Wtime()
920
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
921
      write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
922
923
924
925
926
927
928
929
930
931
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
Andreas Marek's avatar
Andreas Marek committed
932
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,  &
933
934
935
                                       mpi_comm_rows, mpi_comm_cols,&
                                       success,THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) return
936
   ttt1 = MPI_Wtime()
937
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
938
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0
939
940
941
942
943
944
945
946
947

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
948
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
949
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
950
951
952
   time_evp_back = ttt1-ttts

   deallocate(tmat)
953
954
955
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif
956
957
958

1  format(a,f10.3)

959
end function solve_evp_complex_2stage
960
961
962

!-------------------------------------------------------------------------------

963
subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
964
                        tmat, success, useQR)
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992

!-------------------------------------------------------------------------------
!  bandred_real: Reduces a distributed symmetric matrix to band form
!
!  Parameters
!
!  na          Order of matrix
!
!  a(lda,*)    Distributed matrix which should be reduced.
!              Distribution is like in Scalapack.
!              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
!              a(:,:) is overwritten on exit with the band and the Householder vectors
!              in the upper half.
!
!  lda         Leading dimension of a
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi bandwith of output matrix
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!  tmat(nbw,nbw,num_blocks)    where num_blocks = (na-1)/nbw + 1
!              Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
993
994
995
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
996
997
   implicit none

998
999
   integer             :: na, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols
   real*8              :: a(lda,*), tmat(nbw,nbw,*)
1000

1001
1002
1003
1004
1005
   integer             :: my_prow, my_pcol, np_rows, np_cols, mpierr
   integer             :: l_cols, l_rows
   integer             :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
   integer             :: istep, ncol, lch, lcx, nlc
   integer             :: tile_size, l_rows_tile, l_cols_tile
1006

1007
   real*8              :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
1008

1009
   real*8, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)
1010

1011
   integer             :: pcol, prow
1012
1013
1014
1015
1016
1017

   ! needed for blocked QR decomposition
   integer             :: PQRPARAM(11), work_size
   real*8              :: dwork_size(1)
   real*8, allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)

1018
1019
1020
   pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number
   prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number

1021
1022
   logical, intent(out):: success

1023
1024
   logical, intent(in) :: useQR

1025
1026
1027
#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("bandred_real")
#endif
1028
1029
1030
1031
   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
1032
   success = .true.
1033
1034
1035

   ! Semibandwith nbw must be a multiple of blocksize nblk

1036
1037
1038
1039
1040
   if (mod(nbw,nblk)/=0) then
     if (my_prow==0 .and. my_pcol==0) then
       write(error_unit,*) 'ERROR: nbw=',nbw,', nblk=',nblk
       write(error_unit,*) 'ELPA2 works only for nbw==n*nblk'
       success = .false.
1041
!         call mpi_abort(mpi_comm_world,0,mpierr)
1042
     endif
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
   endif

   ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

   tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
   tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

   l_rows_tile = tile_size/np_rows ! local rows of a tile
   l_cols_tile = tile_size/np_cols ! local cols of a tile

1053
1054
1055
1056
1057
1058
1059
   if (useQR) then
     if (which_qr_decomposition == 1) then
       call qr_pqrparam_init(pqrparam,    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
       allocate(tauvector(na))
       allocate(blockheuristic(nblk))
       l_rows = local_index(na, my_prow, np_rows, nblk, -1)
       allocate(vmr(max(l_rows,1),na))
1060

1061
       call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), tmat(1,1,1), nbw, dwork_size(1), -1, na, &
1062
                             nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
1063
1064
       work_size = dwork_size(1)
       allocate(work_blocked(work_size))
1065

1066
1067
1068
       work_blocked = 0.0d0
       deallocate(vmr)
     endif
1069
1070
   endif

1071
1072
   do istep = (na-1)/nbw, 1, -1

1073
     n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
1074

1075
1076
1077
     ! Number of local columns/rows of remaining matrix
     l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
     l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
1078

1079
     ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
1080

1081
1082
     allocate(vmr(max(l_rows,1),2*n_cols))
     allocate(umc(max(l_cols,1),2*n_cols))
1083

1084
     allocate(vr(l_rows+1))
1085

1086
1087
1088
     vmr(1:l_rows,1:n_cols) = 0.
     vr(:) = 0
     tmat(:,:,istep) = 0
1089

1090
     ! Reduce current block to lower triangular form
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100

     if (useQR) then
       if (which_qr_decomposition == 1) then
         call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), &
                                  tmat(1,1,istep), nbw, work_blocked,       &
                                  work_size, na, n_cols, nblk, nblk,        &
                                  istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                  0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
                                  blockheuristic)
       endif
1101
     else
1102

1103
       do lc = n_cols, 1, -1
1104

1105
1106
         ncol = istep*nbw + lc ! absolute column number of householder vector
         nrow = ncol - nbw ! Absolute number of pivot row
1107

1108
1109
         lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
         lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
1110

1111
         tau = 0
1112

1113
         if (nrow == 1) exit ! Nothing to do
1114

1115
         cur_pcol = pcol(ncol) ! Processor column owning current block
1116

1117
         if (my_pcol==cur_pcol) then
1118

1119
1120
           ! Get vector to be transformed; distribute last element and norm of
           ! remaining elements to all procs in current column
1121

1122
           vr(1:lr) = a(1:lr,lch) ! vector to be transformed
1123

1124
1125
1126
1127
1128
1129
1130
           if (my_prow==prow(nrow)) then
             aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
             aux1(2) = vr(lr)
           else
             aux1(1) = dot_product(vr(1:lr),vr(1:lr))
             aux1(2) = 0.
           endif
1131

1132
           call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
1133

1134
1135
           vnorm2 = aux2(1)
           vrl    = aux2(2)
1136

1137
           ! Householder transformation
1138

1139
           call hh_transform_real(vrl, vnorm2, xf, tau)
1140

1141
           ! Scale vr and store Householder vector for back transformation
1142

1143
1144
1145
1146
1147
1148
1149
           vr(1:lr) = vr(1:lr) * xf
           if (my_prow==prow(nrow)) then
             a(1:lr-1,lch) = vr(1:lr-1)
             a(lr,lch) = vrl
             vr(lr) = 1.
           else
             a(1:lr,lch) = vr(1:lr)
1150
           endif
1151

1152
         endif
1153

1154
         ! Broadcast Householder vector and tau along columns
1155

1156
1157
1158
1159
1160
         vr(lr+1) = tau
         call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
         vmr(1:lr,lc) = vr(1:lr)
         tau = vr(lr+1)
         tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
1161

1162
1163
         ! Transform remaining columns in current block with Householder vector
         ! Local dot product
1164

1165
         aux1 = 0
1166

1167
1168
1169
1170
1171
1172
1173
1174
         nlc = 0 ! number of local columns
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
           endif
         enddo
1175

1176
1177
         ! Get global dot products
         if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
1178

1179
         ! Transform
1180

1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
         nlc = 0
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
           endif
         enddo

       enddo
1191

1192
1193
       ! Calculate scalar products of stored Householder vectors.
       ! This can be done in different ways, we use dsyrk
1194

1195
1196
       vav = 0
       if (l_rows>0) &
1197
           call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,1),0.d0,vav,ubound(vav,1))
1198
       call symm_matrix_allreduce(n_cols,vav,ubound(vav,1),mpi_comm_rows)
1199

1200
       ! Calculate triangular matrix T for block Householder Transformation
1201

1202
1203
1204
1205
1206
1207
1208
       do lc=n_cols,1,-1
         tau = tmat(lc,lc,istep)
         if (lc<n_cols) then
           call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,1),vav(lc+1,lc),1)
           tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
         endif
       enddo
1209
     endif
1210

1211
    ! Transpose vmr -> vmc (stored in umc, second half)
1212

1213
    call elpa_transpose_vectors  (vmr, ubound(vmr,1), mpi_comm_rows, &
1214
1215
1216
                                    umc(1,n_cols+1), ubound(umc,1), mpi_comm_cols, &
                                    1, istep*nbw, n_cols, nblk)

1217
1218
1219
1220
    ! Calculate umc = A**T * vmr
    ! Note that the distributed A has to be transposed
    ! Opposed to direct tridiagonalization there is no need to use the cache locality
    ! of the tiles, so we can use strips of the matrix
1221

1222
1223
1224
1225
    umc(1:l_cols,1:n_cols) = 0.d0
    vmr(1:l_rows,n_cols+1:2*n_cols) = 0
    if (l_cols>0 .and. l_rows>0) then
      do i=0,(istep*nbw-1)/tile_size
1226

1227
1228