elpa2.F90 221 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"

module ELPA2

! Version 1.1.2, 2011-02-21

  USE ELPA1

67
68
69
#ifdef HAVE_ISO_FORTRAN_ENV
  use iso_fortran_env, only : error_unit
#endif
70
71
72

  use elpa_pdgeqrf

73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

  public :: bandred_real
  public :: tridiag_band_real
  public :: trans_ev_tridi_to_band_real
  public :: trans_ev_band_to_full_real

  public :: bandred_complex
  public :: tridiag_band_complex
  public :: trans_ev_tridi_to_band_complex
  public :: trans_ev_band_to_full_complex

Andreas Marek's avatar
Andreas Marek committed
92
93
94
95
96
97
98
99
100
101
102
103
  public :: get_actual_real_kernel_name, get_actual_complex_kernel_name
  public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, &
            REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ,                &
            REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_AVX_BLOCK2,         &
            REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6

  public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
            COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ,                &
            COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_AVX_BLOCK1,         &
            COMPLEX_ELPA_KERNEL_AVX_BLOCK2

  public :: print_available_real_kernels, print_available_complex_kernels
104
105
106
107
#ifndef HAVE_ISO_FORTRAN_ENV
  integer, parameter :: error_unit = 6
#endif

Andreas Marek's avatar
Andreas Marek committed
108
109
110
111
112
113
114
115
116
117
118
119

  integer, parameter :: number_of_real_kernels           = 8
  integer, parameter :: REAL_ELPA_KERNEL_GENERIC         = 1
  integer, parameter :: REAL_ELPA_KERNEL_GENERIC_SIMPLE  = 2
  integer, parameter :: REAL_ELPA_KERNEL_BGP             = 3
  integer, parameter :: REAL_ELPA_KERNEL_BGQ             = 4
  integer, parameter :: REAL_ELPA_KERNEL_SSE             = 5
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2      = 6
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4      = 7
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6      = 8

#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
120
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
121
#else
122
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
123
124
#endif
  character(35), parameter, dimension(number_of_real_kernels) :: &
125
126
127
128
129
130
131
132
  REAL_ELPA_KERNEL_NAMES =    (/"REAL_ELPA_KERNEL_GENERIC         ", &
                                "REAL_ELPA_KERNEL_GENERIC_SIMPLE  ", &
                                "REAL_ELPA_KERNEL_BGP             ", &
                                "REAL_ELPA_KERNEL_BGQ             ", &
                                "REAL_ELPA_KERNEL_SSE             ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK2      ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK4      ", &
                                "REAL_ELPA_KERNEL_AVX_BLOCK6      "/)
Andreas Marek's avatar
Andreas Marek committed
133
134
135
136
137
138
139
140
141
142
143

  integer, parameter :: number_of_complex_kernels           = 7
  integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC         = 1
  integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE  = 2
  integer, parameter :: COMPLEX_ELPA_KERNEL_BGP             = 3
  integer, parameter :: COMPLEX_ELPA_KERNEL_BGQ             = 4
  integer, parameter :: COMPLEX_ELPA_KERNEL_SSE             = 5
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1      = 6
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2      = 7

#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
144
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
145
#else
146
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
147
148
#endif
  character(35), parameter, dimension(number_of_complex_kernels) :: &
149
150
151
152
153
154
155
  COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC         ", &
                                "COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE  ", &
                                "COMPLEX_ELPA_KERNEL_BGP             ", &
                                "COMPLEX_ELPA_KERNEL_BGQ             ", &
                                "COMPLEX_ELPA_KERNEL_SSE             ", &
                                "COMPLEX_ELPA_KERNEL_AVX_BLOCK1      ", &
                                "COMPLEX_ELPA_KERNEL_AVX_BLOCK2      "/)
Andreas Marek's avatar
Andreas Marek committed
156
157
158
159
160
161
162

  integer, parameter                                    ::             &
           AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) =       &
                                      (/                               &
#if WITH_REAL_GENERIC_KERNEL
                                        1                              &
#else
163
                                        0                              &
Andreas Marek's avatar
Andreas Marek committed
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#endif
#if WITH_REAL_GENERIC_SIMPLE_KERNEL
                                          ,1                           &
#else
                                          ,0                           &
#endif
#if WITH_REAL_BGP_KERNEL
                                            ,1                         &
#else
                                            ,0                         &
#endif
#if WITH_REAL_BGQ_KERNEL
                                              ,1                       &
#else
                                              ,0                       &
#endif
#if WITH_REAL_SSE_KERNEL
                                                ,1                     &
#else
                                                ,0                     &
#endif
#if WITH_REAL_AVX_BLOCK2_KERNEL
                                                  ,1                   &
#else
                                                  ,0                   &
#endif
#if WITH_REAL_AVX_BLOCK4_KERNEL
                                                    ,1                 &
#else
                                                    ,0                 &
#endif
#if WITH_REAL_AVX_BLOCK6_KERNEL
                                                      ,1               &
#else
                                                      ,0               &
#endif
                                                       /)

  integer, parameter ::                                                   &
           AVAILABLE_COMPLEX_ELPA_KERNELS(number_of_complex_kernels) =    &
                                      (/                                  &
#if WITH_COMPLEX_GENERIC_KERNEL
                                        1                                 &
#else
                                        0                                 &
#endif
#if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
                                          ,1                              &
#else
                                          ,0                              &
#endif
#if WITH_COMPLEX_BGP_KERNEL
                                            ,1                            &
#else
                                            ,0                            &
#endif
#if WITH_COMPLEX_BGQ_KERNEL
                                              ,1                          &
#else
                                              ,0                          &
#endif
#if WITH_COMPLEX_SSE_KERNEL
                                                ,1                        &
#else
                                                ,0                        &
#endif
230
#if WITH_COMPLEX_AVX_BLOCK1_KERNEL
Andreas Marek's avatar
Andreas Marek committed
231
232
233
234
                                                  ,1                      &
#else
                                                  ,0                      &
#endif
235
#if WITH_COMPLEX_AVX_BLOCK2_KERNEL
Andreas Marek's avatar
Andreas Marek committed
236
237
238
239
240
                                                    ,1                    &
#else
                                                    ,0                    &
#endif
                                                   /)
241
242
243
244
245
246
247

  public :: band_band_real
  public :: divide_band

  integer, public :: which_qr_decomposition = 1     ! defines, which QR-decomposition algorithm will be used
                                                    ! 0 for unblocked
                                                    ! 1 for blocked (maxrank: nblk)
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
!-------------------------------------------------------------------------------

  ! The following array contains the Householder vectors of the
  ! transformation band -> tridiagonal.
  ! It is allocated and set in tridiag_band_real and used in
  ! trans_ev_tridi_to_band_real.
  ! It must be deallocated by the user after trans_ev_tridi_to_band_real!

  real*8, allocatable :: hh_trans_real(:,:)
  complex*16, allocatable :: hh_trans_complex(:,:)

!-------------------------------------------------------------------------------

  include 'mpif.h'


!******
contains
266

Andreas Marek's avatar
Andreas Marek committed
267
subroutine print_available_real_kernels
Andreas Marek's avatar
Andreas Marek committed
268
269
270
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
Andreas Marek's avatar
Andreas Marek committed
271
272
273
  implicit none

  integer :: i
274

Andreas Marek's avatar
Andreas Marek committed
275
276
277
278
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("print_available_real_kernels")
#endif

Andreas Marek's avatar
Andreas Marek committed
279
  do i=1, number_of_real_kernels
280
281
282
    if (AVAILABLE_REAL_ELPA_KERNELS(i) .eq. 1) then
      write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
    endif
Andreas Marek's avatar
Andreas Marek committed
283
284
285
286
287
  enddo
  write(error_unit,*) " "
  write(error_unit,*) " At the moment the following kernel would be choosen:"
  write(error_unit,*) get_actual_real_kernel_name()

Andreas Marek's avatar
Andreas Marek committed
288
289
290
#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("print_available_real_kernels")
#endif
Andreas Marek's avatar
Andreas Marek committed
291
292
293
294

end subroutine print_available_real_kernels

subroutine print_available_complex_kernels
Andreas Marek's avatar
Andreas Marek committed
295
296
297
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
Andreas Marek's avatar
Andreas Marek committed
298
299
300
301

  implicit none

  integer :: i
Andreas Marek's avatar
Andreas Marek committed
302
303
304
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("print_available_complex_kernels")
#endif
305

306
  do i=1, number_of_complex_kernels
307
308
309
    if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .eq. 1) then
       write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
    endif
Andreas Marek's avatar
Andreas Marek committed
310
311
312
  enddo
  write(error_unit,*) " "
  write(error_unit,*) " At the moment the following kernel would be choosen:"
Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
313
  write(error_unit,*) get_actual_complex_kernel_name()
Andreas Marek's avatar
Andreas Marek committed
314

Andreas Marek's avatar
Andreas Marek committed
315
316
317
#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("print_available_complex_kernels")
#endif
Andreas Marek's avatar
Andreas Marek committed
318
319
320

end subroutine print_available_complex_kernels

321
function get_actual_real_kernel() result(actual_kernel)
Andreas Marek's avatar
Andreas Marek committed
322
323
324
325
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  implicit none
Andreas Marek's avatar
Andreas Marek committed
326
327

  integer :: actual_kernel
328

Andreas Marek's avatar
Andreas Marek committed
329
330
331
332
333
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("get_actual_real_kernel")
#endif


Andreas Marek's avatar
Andreas Marek committed
334
335
336
  ! if kernel is not choosen via api
  ! check whether set by environment variable
  actual_kernel = real_kernel_via_environment_variable()
337

Andreas Marek's avatar
Andreas Marek committed
338
  if (actual_kernel .eq. 0) then
339
340
    ! if not then set default kernel
    actual_kernel = DEFAULT_REAL_ELPA_KERNEL
Andreas Marek's avatar
Andreas Marek committed
341
  endif
Andreas Marek's avatar
Andreas Marek committed
342
343
344
345
346

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("get_actual_real_kernel")
#endif

Andreas Marek's avatar
Andreas Marek committed
347
348
end function get_actual_real_kernel

349
function get_actual_real_kernel_name() result(actual_kernel_name)
Andreas Marek's avatar
Andreas Marek committed
350
351
352
353
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  implicit none
Andreas Marek's avatar
Andreas Marek committed
354
355
356

  character(35) :: actual_kernel_name
  integer       :: actual_kernel
Andreas Marek's avatar
Andreas Marek committed
357
358
359
360
361

#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("get_actual_real_kernel_name")
#endif

Andreas Marek's avatar
Andreas Marek committed
362
363
  actual_kernel = get_actual_real_kernel()
  actual_kernel_name = REAL_ELPA_KERNEL_NAMES(actual_kernel)
Andreas Marek's avatar
Andreas Marek committed
364
365
366
367
368

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("get_actual_real_kernel_name")
#endif

Andreas Marek's avatar
Andreas Marek committed
369
370
end function get_actual_real_kernel_name

371
function get_actual_complex_kernel() result(actual_kernel)
Andreas Marek's avatar
Andreas Marek committed
372
373
374
375
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  implicit none
Andreas Marek's avatar
Andreas Marek committed
376
  integer :: actual_kernel
377

Andreas Marek's avatar
Andreas Marek committed
378
379
380
381
382
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("get_actual_complex_kernel")
#endif


Andreas Marek's avatar
Andreas Marek committed
383
384
385
  ! if kernel is not choosen via api
  ! check whether set by environment variable
  actual_kernel = complex_kernel_via_environment_variable()
386

Andreas Marek's avatar
Andreas Marek committed
387
  if (actual_kernel .eq. 0) then
388
389
    ! if not then set default kernel
    actual_kernel = DEFAULT_COMPLEX_ELPA_KERNEL
Andreas Marek's avatar
Andreas Marek committed
390
  endif
Andreas Marek's avatar
Andreas Marek committed
391
392
393
394
395

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("get_actual_complex_kernel")
#endif

Andreas Marek's avatar
Andreas Marek committed
396
397
end function get_actual_complex_kernel

398
function get_actual_complex_kernel_name() result(actual_kernel_name)
Andreas Marek's avatar
Andreas Marek committed
399
400
401
402
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  implicit none
Andreas Marek's avatar
Andreas Marek committed
403
404
  character(35) :: actual_kernel_name
  integer       :: actual_kernel
Andreas Marek's avatar
Andreas Marek committed
405
406
407
408
409

#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("get_actual_complex_kernel_name")
#endif

Andreas Marek's avatar
Andreas Marek committed
410
411
  actual_kernel = get_actual_complex_kernel()
  actual_kernel_name = COMPLEX_ELPA_KERNEL_NAMES(actual_kernel)
Andreas Marek's avatar
Andreas Marek committed
412
413
414
415
416
417


#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("get_actual_complex_kernel_name")
#endif

Andreas Marek's avatar
Andreas Marek committed
418
419
420
end function get_actual_complex_kernel_name

function check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL) result(err)
Andreas Marek's avatar
Andreas Marek committed
421
422
423
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
Andreas Marek's avatar
Andreas Marek committed
424
425
  implicit none
  integer, intent(in) :: THIS_REAL_ELPA_KERNEL
426

Andreas Marek's avatar
Andreas Marek committed
427
428
  logical             :: err

Andreas Marek's avatar
Andreas Marek committed
429
430
431
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("check_allowed_real_kernels")
#endif
Andreas Marek's avatar
Andreas Marek committed
432
433
434
435
  err = .false.

  if (AVAILABLE_REAL_ELPA_KERNELS(THIS_REAL_ELPA_KERNEL) .ne. 1) err=.true.

Andreas Marek's avatar
Andreas Marek committed
436
437
438
439
#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("check_allowed_real_kernels")
#endif

Andreas Marek's avatar
Andreas Marek committed
440
441
442
end function check_allowed_real_kernels

function check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL) result(err)
Andreas Marek's avatar
Andreas Marek committed
443
444
445
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
Andreas Marek's avatar
Andreas Marek committed
446
447
448
449
  implicit none
  integer, intent(in) :: THIS_COMPLEX_ELPA_KERNEL

  logical             :: err
Andreas Marek's avatar
Andreas Marek committed
450
451
452
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("check_allowed_complex_kernels")
#endif
Andreas Marek's avatar
Andreas Marek committed
453
454
455
  err = .false.

  if (AVAILABLE_COMPLEX_ELPA_KERNELS(THIS_COMPLEX_ELPA_KERNEL) .ne. 1) err=.true.
Andreas Marek's avatar
Andreas Marek committed
456
457
458
459
460

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("check_allowed_complex_kernels")
#endif

Andreas Marek's avatar
Andreas Marek committed
461
462
end function check_allowed_complex_kernels

463
function qr_decomposition_via_environment_variable(useQR) result(isSet)
Andreas Marek's avatar
Andreas Marek committed
464
465
466
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
467
468
469
470
471
  implicit none
  logical, intent(out) :: useQR
  logical              :: isSet
  CHARACTER(len=255)   :: ELPA_QR_DECOMPOSITION

Andreas Marek's avatar
Andreas Marek committed
472
473
474
475
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("qr_decomposition_via_environment_variable")
#endif

476
477
478
479
480
481
482
483
484
485
486
487
488
489
  isSet = .false.

#if defined(HAVE_ENVIRONMENT_CHECKING)
  call get_environment_variable("ELPA_QR_DECOMPOSITION",ELPA_QR_DECOMPOSITION)
#endif
  if (trim(ELPA_QR_DECOMPOSITION) .eq. "yes") then
    useQR = .true.
    isSet = .true.
  endif
  if (trim(ELPA_QR_DECOMPOSITION) .eq. "no") then
    useQR = .false.
    isSet = .true.
  endif

Andreas Marek's avatar
Andreas Marek committed
490
491
492
493
#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("qr_decomposition_via_environment_variable")
#endif

494
495
496
end function qr_decomposition_via_environment_variable


497
function real_kernel_via_environment_variable() result(kernel)
Andreas Marek's avatar
Andreas Marek committed
498
499
500
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
Andreas Marek's avatar
Andreas Marek committed
501
502
503
504
505
  implicit none
  integer :: kernel
  CHARACTER(len=255) :: REAL_KERNEL_ENVIRONMENT
  integer :: i

Andreas Marek's avatar
Andreas Marek committed
506
507
508
509
#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("real_kernel_via_environment_variable")
#endif

Andreas Marek's avatar
Andreas Marek committed
510
511
512
513
514
#if defined(HAVE_ENVIRONMENT_CHECKING)
  call get_environment_variable("REAL_ELPA_KERNEL",REAL_KERNEL_ENVIRONMENT)
#endif
  do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
!     if (trim(dummy_char) .eq. trim(REAL_ELPA_KERNEL_NAMES(i))) then
515
516
517
518
519
520
    if (trim(REAL_KERNEL_ENVIRONMENT) .eq. trim(REAL_ELPA_KERNEL_NAMES(i))) then
      kernel = i
      exit
    else
      kernel = 0
    endif
Andreas Marek's avatar
Andreas Marek committed
521
  enddo
Andreas Marek's avatar
Andreas Marek committed
522
523
524
525
526

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("real_kernel_via_environment_variable")
#endif

Andreas Marek's avatar
Andreas Marek committed
527
528
end function real_kernel_via_environment_variable

529
function complex_kernel_via_environment_variable() result(kernel)
Andreas Marek's avatar
Andreas Marek committed
530
531
532
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
Andreas Marek's avatar
Andreas Marek committed
533
534
535
536
537
  implicit none
  integer :: kernel

  CHARACTER(len=255) :: COMPLEX_KERNEL_ENVIRONMENT
  integer :: i
Andreas Marek's avatar
Andreas Marek committed
538
539
540
541
542

#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("complex_kernel_via_environment_variable")
#endif

Andreas Marek's avatar
Andreas Marek committed
543
544
545
546
547
#if defined(HAVE_ENVIRONMENT_CHECKING)
  call get_environment_variable("COMPLEX_ELPA_KERNEL",COMPLEX_KERNEL_ENVIRONMENT)
#endif

  do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
548
549
550
551
552
553
    if (trim(COMPLEX_ELPA_KERNEL_NAMES(i)) .eq. trim(COMPLEX_KERNEL_ENVIRONMENT)) then
      kernel = i
      exit
    else
      kernel = 0
    endif
Andreas Marek's avatar
Andreas Marek committed
554
555
  enddo

Andreas Marek's avatar
Andreas Marek committed
556
557
558
559
560
#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("complex_kernel_via_environment_variable")
#endif


Andreas Marek's avatar
Andreas Marek committed
561
562
end function complex_kernel_via_environment_variable

563
564
565
566
function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601

!-------------------------------------------------------------------------------
!  solve_evp_real_2stage: Solves the real eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,*)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,*)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
602
603
604
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
605
   implicit none
606
607
   logical, intent(in), optional :: useQR
   logical                       :: useQRActual, useQREnvironment
Andreas Marek's avatar
Andreas Marek committed
608
   integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
609
   integer                       :: THIS_REAL_ELPA_KERNEL
610

611
612
613
   integer, intent(in)           :: na, nev, lda, ldq, nblk, mpi_comm_rows, &
                                    mpi_comm_cols, mpi_comm_all
   real*8, intent(inout)         :: a(lda,*), ev(na), q(ldq,*)
614

615
616
617
618
619
620
   integer                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer                       :: nbw, num_blocks
   real*8, allocatable           :: tmat(:,:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
   logical                       :: success
621

622
623
624
#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif
625
626
627
628
629
630
631
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
632

633
634
   success = .true.

635
636
637
638
639
640
641
642
643
644
645
646
647
648
   useQRActual = .false.

   ! set usage of qr decomposition via API call
   if (present(useQR)) then
     if (useQR) useQRActual = .true.
     if (useQR .and. na .lt. 800) useQRActual = .false.
     if (.not.(useQR)) useQRACtual = .false.
   endif

   ! overwrite this with environment variable settings
   if (qr_decomposition_via_environment_variable(useQREnvironment)) then
     useQRActual = useQREnvironment
   endif

649
650
651
652
653
   if (useQRActual) then
     print *,"Carefull you use the experimental feature QR-decomposition"
     print *,"it is possible that the results are wrong"
   endif

654
655
656
   if (present(THIS_REAL_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
Andreas Marek's avatar
Andreas Marek committed
657
   else
658

659
660
661
     ! if kernel is not choosen via api
     ! check whether set by environment variable
     THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
Andreas Marek's avatar
Andreas Marek committed
662
663
664
665
   endif

   ! check whether choosen kernel is allowed
   if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
666

667
668
669
670
671
672
673
674
675
676
677
     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
         endif
       enddo
Andreas Marek's avatar
Andreas Marek committed
678

679
680
681
682
       write(error_unit,*) " "
       write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !"
     endif
     THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
683
684

   endif
685
686
687
688
689
690
691
692
693
694
695
696
697

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
698
   call bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
699
                     tmat, success, useQRActual)
700
   if (.not.(success)) return
701
   ttt1 = MPI_Wtime()
702
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
703
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0
704
705
706
707
708
709

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
Andreas Marek's avatar
Andreas Marek committed
710
711
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, &
                          mpi_comm_cols, mpi_comm_all)
712
   ttt1 = MPI_Wtime()
713
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
714
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0
715
716
717
718
719
720
721
722
723
724

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
725
726
727
728
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, mpi_comm_rows,  &
                    mpi_comm_cols, success)
   if (.not.(success)) return

729
   ttt1 = MPI_Wtime()
730
731
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
     write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
732
733
734
735
736
737
738
739
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
740
741
742
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows, &
                                    mpi_comm_cols, success, THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) return
743
   ttt1 = MPI_Wtime()
744
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
745
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0
746
747
748
749
750
751
752

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
753
754
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, &
                                   mpi_comm_cols, useQRActual)
755
   ttt1 = MPI_Wtime()
756
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
757
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
758
759
760
   time_evp_back = ttt1-ttts

   deallocate(tmat)
761
762
763
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif
764
765
1  format(a,f10.3)

766
end function solve_evp_real_2stage
767
768
769
770
771

!-------------------------------------------------------------------------------

!-------------------------------------------------------------------------------

772
function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
Andreas Marek's avatar
Andreas Marek committed
773
                                    mpi_comm_rows, mpi_comm_cols,      &
774
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809

!-------------------------------------------------------------------------------
!  solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,*)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,*)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
810
811
812
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
813
   implicit none
Andreas Marek's avatar
Andreas Marek committed
814
815
   integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer                       :: THIS_COMPLEX_ELPA_KERNEL
816
817
818
819
820
821
822
823
824
825
   integer, intent(in)           :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex*16, intent(inout)     :: a(lda,*), q(ldq,*)
   real*8, intent(inout)         :: ev(na)

   integer                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex*16, allocatable       :: tmat(:,:,:)
   real*8, allocatable           :: q_real(:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
826

827
   logical                       :: success
828
829
830
#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif
Andreas Marek's avatar
Andreas Marek committed
831
832
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
833
834
835
836
837

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
838
839
840

   success = .true.

841
842
843
   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
Andreas Marek's avatar
Andreas Marek committed
844
   else
845
846
847
     ! if kernel is not choosen via api
     ! check whether set by environment variable
     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
Andreas Marek's avatar
Andreas Marek committed
848
   endif
849

Andreas Marek's avatar
Andreas Marek committed
850
851
   ! check whether choosen kernel is allowed
   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then
852

853
854
855
856
857
858
859
860
861
862
863
     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
         endif
       enddo
Andreas Marek's avatar
Andreas Marek committed
864

865
866
867
868
       write(error_unit,*) " "
       write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
     endif
     THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
Andreas Marek's avatar
Andreas Marek committed
869
870
!      call MPI_ABORT(mpi_comm_all, mpierr)
   endif
871
872
873
874
875
876
877
878
879
880
881
882
   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
883
884
   call bandred_complex(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
                        tmat, success)
885
886
887
888
889
890
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop()
#endif
     return
   endif
891
   ttt1 = MPI_Wtime()
892
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
893
      write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0
894
895
896
897
898
899
900
901

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
902
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
903
      write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

   allocate(q_real(l_rows,l_cols))

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
920
921
922
923
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,1), nblk, &
                    mpi_comm_rows, mpi_comm_cols, success)
   if (.not.(success)) return

924
   ttt1 = MPI_Wtime()
925
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
926
      write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
927
928
929
930
931
932
933
934
935
936
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
Andreas Marek's avatar
Andreas Marek committed
937
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,  &
938
939
940
                                       mpi_comm_rows, mpi_comm_cols,&
                                       success,THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) return
941
   ttt1 = MPI_Wtime()
942
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
943
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0
944
945
946
947
948
949
950
951
952

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
953
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
954
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
955
956
957
   time_evp_back = ttt1-ttts

   deallocate(tmat)
958
959
960
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif
961
962
963

1  format(a,f10.3)

964
end function solve_evp_complex_2stage
965
966
967

!-------------------------------------------------------------------------------

968
subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
969
                        tmat, success, useQR)
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997

!-------------------------------------------------------------------------------
!  bandred_real: Reduces a distributed symmetric matrix to band form
!
!  Parameters
!
!  na          Order of matrix
!
!  a(lda,*)    Distributed matrix which should be reduced.
!              Distribution is like in Scalapack.
!              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
!              a(:,:) is overwritten on exit with the band and the Householder vectors
!              in the upper half.
!
!  lda         Leading dimension of a
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi bandwith of output matrix
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!  tmat(nbw,nbw,num_blocks)    where num_blocks = (na-1)/nbw + 1
!              Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
998
999
1000
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
1001
1002
   implicit none

1003
1004
   integer             :: na, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols
   real*8              :: a(lda,*), tmat(nbw,nbw,*)
1005

1006
1007
1008
1009
1010
   integer             :: my_prow, my_pcol, np_rows, np_cols, mpierr
   integer             :: l_cols, l_rows
   integer             :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
   integer             :: istep, ncol, lch, lcx, nlc
   integer             :: tile_size, l_rows_tile, l_cols_tile
1011

1012
   real*8              :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
1013

1014
   real*8, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)
1015

1016
   integer             :: pcol, prow
1017
1018
1019
1020
1021
1022

   ! needed for blocked QR decomposition
   integer             :: PQRPARAM(11), work_size
   real*8              :: dwork_size(1)
   real*8, allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)

1023
1024
1025
   pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number
   prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number

1026
1027
   logical, intent(out):: success

1028
1029
   logical, intent(in) :: useQR

1030
1031
1032
#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("bandred_real")
#endif
1033
1034
1035
1036
   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
1037
   success = .true.
1038
1039
1040

   ! Semibandwith nbw must be a multiple of blocksize nblk

1041
1042
1043
1044
1045
   if (mod(nbw,nblk)/=0) then
     if (my_prow==0 .and. my_pcol==0) then
       write(error_unit,*) 'ERROR: nbw=',nbw,', nblk=',nblk
       write(error_unit,*) 'ELPA2 works only for nbw==n*nblk'
       success = .false.
1046
!         call mpi_abort(mpi_comm_world,0,mpierr)
1047
     endif
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
   endif

   ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

   tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
   tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

   l_rows_tile = tile_size/np_rows ! local rows of a tile
   l_cols_tile = tile_size/np_cols ! local cols of a tile

1058
1059
1060
1061
1062
1063
1064
   if (useQR) then
     if (which_qr_decomposition == 1) then
       call qr_pqrparam_init(pqrparam,    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
       allocate(tauvector(na))
       allocate(blockheuristic(nblk))
       l_rows = local_index(na, my_prow, np_rows, nblk, -1)
       allocate(vmr(max(l_rows,1),na))
1065

1066
       call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), tmat(1,1,1), nbw, dwork_size(1), -1, na, &
1067
                             nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
1068
1069
       work_size = dwork_size(1)
       allocate(work_blocked(work_size))
1070

1071
1072
1073
       work_blocked = 0.0d0
       deallocate(vmr)
     endif
1074
1075
   endif

1076
1077
   do istep = (na-1)/nbw, 1, -1

1078
     n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
1079

1080
1081
1082
     ! Number of local columns/rows of remaining matrix
     l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
     l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
1083

1084
     ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
1085

1086
1087
     allocate(vmr(max(l_rows,1),2*n_cols))
     allocate(umc(max(l_cols,1),2*n_cols))
1088

1089
     allocate(vr(l_rows+1))
1090

1091
1092
1093
     vmr(1:l_rows,1:n_cols) = 0.
     vr(:) = 0
     tmat(:,:,istep) = 0
1094

1095
     ! Reduce current block to lower triangular form
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105

     if (useQR) then
       if (which_qr_decomposition == 1) then
         call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), &
                                  tmat(1,1,istep), nbw, work_blocked,       &
                                  work_size, na, n_cols, nblk, nblk,        &
                                  istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                  0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
                                  blockheuristic)
       endif
1106
     else
1107

1108
       do lc = n_cols, 1, -1
1109

1110
1111
         ncol = istep*nbw + lc ! absolute column number of householder vector
         nrow = ncol - nbw ! Absolute number of pivot row
1112

1113
1114
         lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
         lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
1115

1116
         tau = 0
1117

1118
         if (nrow == 1) exit ! Nothing to do
1119

1120
         cur_pcol = pcol(ncol) ! Processor column owning current block
1121

1122
         if (my_pcol==cur_pcol) then
1123

1124
1125
           ! Get vector to be transformed; distribute last element and norm of
           ! remaining elements to all procs in current column
1126

1127
           vr(1:lr) = a(1:lr,lch) ! vector to be transformed
1128

1129
1130
1131
1132
1133
1134
1135
           if (my_prow==prow(nrow)) then
             aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
             aux1(2) = vr(lr)
           else
             aux1(1) = dot_product(vr(1:lr),vr(1:lr))
             aux1(2) = 0.
           endif
1136

1137
           call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
1138

1139
1140
           vnorm2 = aux2(1)
           vrl    = aux2(2)
1141

1142
           ! Householder transformation
1143

1144
           call hh_transform_real(vrl, vnorm2, xf, tau)
1145

1146
           ! Scale vr and store Householder vector for back transformation
1147

1148
1149
1150
1151
1152
1153
1154
           vr(1:lr) = vr(1:lr) * xf
           if (my_prow==prow(nrow)) then
             a(1:lr-1,lch) = vr(1:lr-1)
             a(lr,lch) = vrl
             vr(lr) = 1.
           else
             a(1:lr,lch) = vr(1:lr)
1155
           endif
1156

1157
         endif
1158

1159
         ! Broadcast Householder vector and tau along columns
1160

1161
1162
1163
1164
1165
         vr(lr+1) = tau
         call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
         vmr(1:lr,lc) = vr(1:lr)
         tau = vr(lr+1)
         tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
1166

1167
1168
         ! Transform remaining columns in current block with Householder vector
         ! Local dot product
1169

1170
         aux1 = 0
1171

1172
1173
1174
1175
1176
1177
1178
1179
         nlc = 0 ! number of local columns
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
           endif
         enddo
1180

1181
1182
         ! Get global dot products
         if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
1183

1184
         ! Transform
1185

1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
         nlc = 0
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
           endif
         enddo

       enddo
1196

1197
1198
       ! Calculate scalar products of stored Householder vectors.
       ! This can be done in different ways, we use dsyrk
1199

1200
1201
       vav = 0
       if (l_rows>0) &
1202
           call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,1),0.d0,vav,ubound(vav,1))
1203
       call symm_matrix_allreduce(n_cols,vav,ubound(vav,1),mpi_comm_rows)
1204

1205
       ! Calculate triangular matrix T for block Householder Transformation
1206

1207
1208
1209
1210
1211
1212
1213
       do lc=n_cols,1,-1
         tau = tmat(lc,lc,istep)
         if (lc<n_cols) then
           call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,1),vav(lc+1,lc),1)
           tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
         endif
       enddo
1214
     endif
1215

1216
    ! Transpose vmr -> vmc (stored in umc, second half)
1217

1218
    call elpa_transpose_vectors  (vmr, ubound(vmr,1), mpi_comm_rows, &
1219
1220
1221
                                    umc(1,n_cols+1), ubound(umc,1), mpi_comm_cols, &
                                    1, istep*nbw, n_cols, nblk)

1222
1223
1224
1225
    ! Calculate umc = A**T * vmr
    ! Note that the distributed A has to be transposed
    ! Opposed to direct tridiagonalization there is no need to use the cache locality
    ! of the tiles, so we can use strips of the matrix
1226

1227
1228
1229
1230
    umc(1:l_cols,1:n_cols) = 0.d0
    vmr(1:l_rows,n_cols+1:2*n_cols) = 0
    if (l_cols>0 .and. l_rows>0) then
      do i=0,(istep*nbw-1)/tile_size
1231

1232
1233
1234
        lcs = i*l_cols_tile+1
        lce = min(l_cols,(i+1)*l_cols_tile)
        if (lce<lcs) cycle
1235

1236
1237
1238
        lre = min(l_rows,(i+1)*l_rows_tile)
        call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,1), &
                     vmr,ubound(vmr,1),1.d0,umc(lcs,1),ubound(umc,1))
1239

1240
1241
1242
1243
1244
1245
        if (i==0) cycle
        lre = min(l_rows,i*l_rows_tile)
        call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
                     umc(lcs,n_cols+1),ubound(umc,1),1.d0,vmr(1,n_cols+1),ubound(vmr,1))
      enddo
    endif
1246

1247
1248
1249
1250
    ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
    ! on the processors containing the diagonal
    ! This is only necessary if ur has been calculated, i.e. if the
    ! global tile size is smaller than the global remaining matrix
1251