!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"

module ELPA2

! Version 1.1.2, 2011-02-21

  use elpa_utilities
  USE ELPA1
  use elpa2_utilities
  use elpa_pdgeqrf

  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

  public :: bandred_real
  public :: tridiag_band_real
  public :: trans_ev_tridi_to_band_real
  public :: trans_ev_band_to_full_real

  public :: bandred_complex
  public :: tridiag_band_complex
  public :: trans_ev_tridi_to_band_complex
  public :: trans_ev_band_to_full_complex

  public :: band_band_real
  public :: divide_band

  integer, public :: which_qr_decomposition = 1     ! defines which QR-decomposition algorithm will be used
                                                    ! 0 for unblocked
                                                    ! 1 for blocked (maxrank: nblk)
!-------------------------------------------------------------------------------

  ! The following array contains the Householder vectors of the
  ! transformation band -> tridiagonal.
  ! It is allocated and set in tridiag_band_real and used in
  ! trans_ev_tridi_to_band_real.
  ! It must be deallocated by the user after trans_ev_tridi_to_band_real!

  real*8, allocatable :: hh_trans_real(:,:)
  complex*16, allocatable :: hh_trans_complex(:,:)

!-------------------------------------------------------------------------------

  include 'mpif.h'


!******
contains

function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                               mpi_comm_rows, mpi_comm_cols,             &
                               mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,  &
                               useQR) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_real_2stage: Solves the real eigenvalue problem with a 2-stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a and q
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,matrixCols)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must always be dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
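!
!  Illustrative usage (a sketch, not part of the original documentation): assuming
!  the caller has set up the 2D block-cyclic distribution, the three communicators
!  and the arrays a, ev, q as described above, a minimal call omitting the optional
!  kernel/QR arguments could look like
!
!    logical :: success
!    success = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                    matrixCols, mpi_comm_rows,         &
!                                    mpi_comm_cols, mpi_comm_all)
!    if (.not. success) write(error_unit,*) 'solve_evp_real_2stage failed'
!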
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   logical, intent(in), optional :: useQR
   logical                       :: useQRActual, useQREnvironment
   integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer                       :: THIS_REAL_ELPA_KERNEL

   integer, intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                    mpi_comm_cols, mpi_comm_all
   integer, intent(in)           :: nblk
   real*8, intent(inout)         :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)

   integer                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer                       :: nbw, num_blocks
   real*8, allocatable           :: tmat(:,:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
   logical                       :: success
   logical, save                 :: firstCall = .true.
   logical                       :: wantDebug

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)


   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif

   success = .true.

   useQRActual = .false.

   ! set usage of qr decomposition via API call
   if (present(useQR)) then
     if (useQR) useQRActual = .true.
     if (.not.(useQR)) useQRActual = .false.
   endif

   ! overwrite this with environment variable settings
   if (qr_decomposition_via_environment_variable(useQREnvironment)) then
     useQRActual = useQREnvironment
   endif

   if (useQRActual) then
     if (mod(na,nblk) .ne. 0) then
       if (wantDebug) then
         write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
       endif
       print *, "Do not use QR-decomposition for this matrix and blocksize."
       success = .false.
       return
     endif
   endif

   if (present(THIS_REAL_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
   else
     ! if kernel is not chosen via api
     ! check whether set by environment variable
     THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
   endif

   ! check whether chosen kernel is allowed
   if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The chosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
         endif
       enddo
       write(error_unit,*) " "
       write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used!"
     endif
     THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC

   endif

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
   ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
   ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
   ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
   ! on this and maybe allow a run-time optimization here
   nbw = (63/nblk+1)*nblk
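   ! (worked example of the integer division: nblk=16 gives nbw = (63/16+1)*16 = 4*16 = 64,
   !  and nblk=32 gives nbw = (63/32+1)*32 = 2*32 = 64)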

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                     tmat, wantDebug, success, useQRActual)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, mpi_comm_rows, &
                          mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                    mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
     write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, mpi_comm_rows, &
                                    mpi_comm_cols, wantDebug, success, THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                   mpi_comm_cols, useQRActual)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif
1  format(a,f10.3)

end function solve_evp_real_2stage

!-------------------------------------------------------------------------------

!-------------------------------------------------------------------------------

function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                  mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2-stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a and q
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,matrixCols)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must always be dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
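!
!  Illustrative usage (sketch only, analogous to the real case above; a is the
!  caller's block-cyclically distributed complex matrix, ev the real eigenvalue array):
!
!    success = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                       matrixCols, mpi_comm_rows,         &
!                                       mpi_comm_cols, mpi_comm_all)
!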
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer                       :: THIS_COMPLEX_ELPA_KERNEL
   integer, intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex*16, intent(inout)     :: a(lda,matrixCols), q(ldq,matrixCols)
   real*8, intent(inout)         :: ev(na)

   integer                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex*16, allocatable       :: tmat(:,:,:)
   real*8, allocatable           :: q_real(:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i

   logical                       :: success, wantDebug
   logical, save                 :: firstCall = .true.

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif


   success = .true.

   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
   else
     ! if kernel is not chosen via api
     ! check whether set by environment variable
     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
   endif
   ! check whether chosen kernel is allowed
   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then
     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The chosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
         endif
       enddo
       write(error_unit,*) " "
       write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used!"
     endif
     THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
   endif
   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk
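   ! (e.g. nblk=16 gives nbw = (31/16+1)*16 = 2*16 = 32)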

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop()
#endif
     return
   endif
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

   allocate(q_real(l_rows,l_cols))

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
                    mpi_comm_rows, mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
      write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,  &
                                       matrixCols, mpi_comm_rows, mpi_comm_cols,&
                                       wantDebug, success,THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, &
                                      mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif

1  format(a,f10.3)

end function solve_evp_complex_2stage

!-------------------------------------------------------------------------------

subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success, useQR)

!-------------------------------------------------------------------------------
!  bandred_real: Reduces a distributed symmetric matrix to band form
!
!  Parameters
!
!  na          Order of matrix
!
!  a(lda,matrixCols)    Distributed matrix which should be reduced.
!              Distribution is like in Scalapack.
!              As opposed to ScaLAPACK, a(:,:) must be set completely (upper and lower half)
!              a(:,:) is overwritten on exit with the band and the Householder vectors
!              in the upper half.
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi-bandwidth of output matrix
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
!              Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
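!
!  Illustrative call (sketch): solve_evp_real_2stage above uses this routine as
!
!    nbw = (63/nblk+1)*nblk
!    numBlocks = (na-1)/nbw + 1
!    allocate(tmat(nbw,nbw,numBlocks))
!    call bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, &
!                      mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, success, useQR)
!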
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
#ifdef WITH_OPENMP
   use omp_lib
#endif
   implicit none

   integer             :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
   real*8              :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)

   integer             :: my_prow, my_pcol, np_rows, np_cols, mpierr
   integer             :: l_cols, l_rows
   integer             :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
   integer             :: istep, ncol, lch, lcx, nlc, mynlc
   integer             :: tile_size, l_rows_tile, l_cols_tile

   real*8              :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)

   real*8, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)

   ! needed for blocked QR decomposition
   integer             :: PQRPARAM(11), work_size
   real*8              :: dwork_size(1)
   real*8, allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)

   logical, intent(in) :: wantDebug
   logical, intent(out):: success

   logical, intent(in) :: useQR

   integer :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, ii, pp, transformChunkSize

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("bandred_real")
#endif
   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
   success = .true.


   ! Semi-bandwidth nbw must be a multiple of blocksize nblk
   if (mod(nbw,nblk)/=0) then
     if (my_prow==0 .and. my_pcol==0) then
       if (wantDebug) then
         write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk
         write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk'
       endif
       success = .false.
       return
     endif
   endif

   ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

   tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
   tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

   l_rows_tile = tile_size/np_rows ! local rows of a tile
   l_cols_tile = tile_size/np_cols ! local cols of a tile
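   ! (illustrative numbers, not from the original code: np_rows=4, np_cols=3, nblk=16
   !  give lcm=12, so tile_size = 192 -> ((128*4-1)/192+1)*192 = 576, hence
   !  l_rows_tile = 144 and l_cols_tile = 192)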

   if (useQR) then
     if (which_qr_decomposition == 1) then
       call qr_pqrparam_init(pqrparam,    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
       allocate(tauvector(na))
       allocate(blockheuristic(nblk))
       l_rows = local_index(na, my_prow, np_rows, nblk, -1)
       allocate(vmr(max(l_rows,1),na))

       call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), tmat(1,1,1), nbw, dwork_size(1), -1, na, &
                             nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
       work_size = dwork_size(1)
       allocate(work_blocked(work_size))

       work_blocked = 0.0d0
       deallocate(vmr)
     endif
   endif

   do istep = (na-1)/nbw, 1, -1

     n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

     ! Number of local columns/rows of remaining matrix
     l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
     l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

     ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

     allocate(vmr(max(l_rows,1),2*n_cols))
     allocate(umc(max(l_cols,1),2*n_cols))

     allocate(vr(l_rows+1))

     vmr(1:l_rows,1:n_cols) = 0.
     vr(:) = 0
     tmat(:,:,istep) = 0

     ! Reduce current block to lower triangular form

     if (useQR) then
       if (which_qr_decomposition == 1) then
         call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), &
                                  tmat(1,1,istep), nbw, work_blocked,       &
                                  work_size, na, n_cols, nblk, nblk,        &
                                  istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                  0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
                                  blockheuristic)
       endif
     else

       do lc = n_cols, 1, -1

         ncol = istep*nbw + lc ! absolute column number of householder vector
         nrow = ncol - nbw ! Absolute number of pivot row

         lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
         lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

         tau = 0

         if (nrow == 1) exit ! Nothing to do

         cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

         if (my_pcol==cur_pcol) then

           ! Get vector to be transformed; distribute last element and norm of
           ! remaining elements to all procs in current column

           vr(1:lr) = a(1:lr,lch) ! vector to be transformed

           if (my_prow==prow(nrow, nblk, np_rows)) then
             aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
             aux1(2) = vr(lr)
           else
             aux1(1) = dot_product(vr(1:lr),vr(1:lr))
             aux1(2) = 0.
           endif

           call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

           vnorm2 = aux2(1)
           vrl    = aux2(2)

           ! Householder transformation

           call hh_transform_real(vrl, vnorm2, xf, tau)

           ! Scale vr and store Householder vector for back transformation

           vr(1:lr) = vr(1:lr) * xf
           if (my_prow==prow(nrow, nblk, np_rows)) then
             a(1:lr-1,lch) = vr(1:lr-1)
             a(lr,lch) = vrl
             vr(lr) = 1.
           else
             a(1:lr,lch) = vr(1:lr)
           endif

         endif

         ! Broadcast Householder vector and tau along columns

         vr(lr+1) = tau
         call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
         vmr(1:lr,lc) = vr(1:lr)
         tau = vr(lr+1)
         tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat

         ! Transform remaining columns in current block with Householder vector
         ! Local dot product

         aux1 = 0
#ifdef WITH_OPENMP
         !Open up one omp region to avoid paying openmp overhead.
         !This does not help performance due to the addition of two openmp barriers around the MPI call,
         !but in the future this may be beneficial if these barriers are replaced with a faster implementation.

         !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
         mynlc = 0 ! number of local columns

         !This loop does not have independent iterations,
         !'mynlc' is incremented each iteration, and it is difficult to remove this dependency 
         !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
         !That is, a thread only executes the work associated with an iteration if its thread id is congruent to 
         !the iteration number modulo the number of threads
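         !(e.g. with 4 threads, thread 0 does the work for j = 1, 5, 9, ..., thread 1 for j = 2, 6, 10, ..., and so on)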
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0 ) then
             mynlc = mynlc+1
             if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                 if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           endif
         enddo
         
         ! Get global dot products
         !$omp barrier
         !$omp single 
         if (mynlc>0) call mpi_allreduce(aux1,aux2,mynlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
         !$omp end single 
         !$omp barrier

         ! Transform
         transformChunkSize=32
         mynlc = 0
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             mynlc = mynlc+1
             !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
             !However, for some reason this is slower than doing it manually, so it is parallelized as below.
             do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                do pp = 1,transformChunkSize
                    if (pp + ii > lr) exit
                        a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
                enddo
             enddo
           endif
         enddo
         !$omp end parallel
#else
         nlc = 0 ! number of local columns
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
           endif
         enddo

         ! Get global dot products
         if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

         ! Transform

         nlc = 0
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
           endif
         enddo
#endif
       enddo

       ! Calculate scalar products of stored Householder vectors.
       ! This can be done in different ways, we use dsyrk

       vav = 0
       if (l_rows>0) &
           call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,dim=1),0.d0,vav,ubound(vav,dim=1))
       call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows)

       ! Calculate triangular matrix T for block Householder Transformation

       do lc=n_cols,1,-1
         tau = tmat(lc,lc,istep)
         if (lc<n_cols) then
           call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,dim=1),vav(lc+1,lc),1)
           tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
         endif
       enddo
     endif

    ! Transpose vmr -> vmc (stored in umc, second half)

    call elpa_transpose_vectors_real  (vmr, ubound(vmr,dim=1), mpi_comm_rows, &
                                    umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, &
                                    1, istep*nbw, n_cols, nblk)

    ! Calculate umc = A**T * vmr
    ! Note that the distributed A has to be transposed
    ! As opposed to direct tridiagonalization there is no need to use the cache locality
    ! of the tiles, so we can use strips of the matrix
    !Code for Algorithm 4

    n_way = 1
#ifdef WITH_OPENMP
    n_way = omp_get_max_threads()
#endif
    !umc(1:l_cols,1:n_cols) = 0.d0
    !vmr(1:l_rows,n_cols+1:2*n_cols) = 0
#ifdef WITH_OPENMP
    !$omp parallel private( i,lcs,lce,lrs,lre)
#endif
    if(n_way > 1) then
        !$omp do
        do i=1,min(l_cols_tile, l_cols)
            umc(i,1:n_cols) = 0.d0
        enddo
        !$omp do
        do i=1,l_rows
            vmr(i,n_cols+1:2*n_cols) = 0.d0
        enddo
        if (l_cols>0 .and. l_rows>0) then

          !SYMM variant 4
          !Partitioned Matrix Expression:
          ! Ct = Atl Bt + Atr Bb
          ! Cb = Atr' Bt + Abl Bb
          !
          !Loop invariant:
          ! Ct = Atl Bt + Atr Bb
          !
          !Update:
          ! C1 = A10'B0 + A11B1 + A21 B2
          !
          !This algorithm was chosen because in this algorithm the loop around the dgemm calls
          !is easily parallelized, and regardless of the choice of algorithm,
          !the startup cost for parallelizing the dgemms inside the loop is too great

          !$omp do schedule(static,1)
          do i=0,(istep*nbw-1)/tile_size
            lcs = i*l_cols_tile+1                   ! local column start
            lce = min(l_cols, (i+1)*l_cols_tile)    ! local column end

            lrs = i*l_rows_tile+1                   ! local row start
            lre = min(l_rows, (i+1)*l_rows_tile)    ! local row end

            !C1 += [A11 A12] [B1
            !                 B2]
            if( lre > lrs .and. l_cols > lcs ) then
            call DGEMM('N','N', lre-lrs+1, n_cols, l_cols-lcs+1,    &
                       1.d0, a(lrs,lcs), ubound(a,dim=1),           &
                             umc(lcs,n_cols+1), ubound(umc,dim=1),  &
                       0.d0, vmr(lrs,n_cols+1), ubound(vmr,dim=1))
            endif

            ! C1 += A10' B0
            if( lce > lcs .and. i > 0 ) then
            call DGEMM('T','N', lce-lcs+1, n_cols, lrs-1,           &
                       1.d0, a(1,lcs),   ubound(a,dim=1),           &
                             vmr(1,1),   ubound(vmr,dim=1),         &
                       0.d0, umc(lcs,1), ubound(umc,dim=1))
            endif
          enddo
        endif
    else
        umc(1:l_cols,1:n_cols) = 0.d0
        vmr(1:l_rows,n_cols+1:2*n_cols) = 0
        if (l_cols>0 .and. l_rows>0) then
          do i=0,(istep*nbw-1)/tile_size

            lcs = i*l_cols_tile+1
            lce = min(l_cols,(i+1)*l_cols_tile)
            if (lce<lcs) cycle

            lre = min(l_rows,(i+1)*l_rows_tile)
            call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,dim=1), &
                         vmr,ubound(vmr,dim=1),1.d0,umc(lcs,1),ubound(umc,dim=1))

            if (i==0) cycle
            lre = min(l_rows,i*l_rows_tile)
            call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
                         umc(lcs,n_cols+1),ubound(umc,dim=1),1.d0,vmr(1,n_cols+1),ubound(vmr,dim=1))
          enddo
        endif
    endif
#ifdef WITH_OPENMP
    !$omp end parallel
#endif
    ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
    ! on the processors containing the diagonal
    ! This is only necessary if ur has been calculated, i.e. if the
    ! global tile size is smaller than the global remaining matrix
    ! Or if we used the Algorithm 4
    if (tile_size < istep*nbw .or. n_way > 1) then
    call elpa_reduce_add_vectors_real  (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, &
                                        umc, ubound(umc,dim=1), mpi_comm_cols, &
                                        istep*nbw, n_cols, nblk)
    endif

    if (l_cols>0) then
      allocate(tmp(l_cols,n_cols))
      call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
      umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
      deallocate(tmp)
    endif

    ! U = U * Tmat**T

    call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,dim=1),umc,ubound(umc,dim=1))

    ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T

    call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umc,ubound(umc,dim=1),umc(1,n_cols+1), &
               ubound(umc,dim=1),0.d0,vav,ubound(vav,dim=1))
    call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep),    &
               ubound(tmat,dim=1),vav,ubound(vav,dim=1))

    call symm_matrix_allreduce(n_cols,vav, nbw, nbw ,mpi_comm_cols)

    ! U = U - 0.5 * V * VAV
    call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umc(1,n_cols+1),ubound(umc,dim=1),vav, &
                ubound(vav,dim=1),1.d0,umc,ubound(umc,dim=1))

    ! Transpose umc -> umr (stored in vmr, second half)

    call elpa_transpose_vectors_real  (umc, ubound(umc,dim=1), mpi_comm_cols, &
                                   vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
                                   1, istep*nbw, n_cols, nblk)

    ! A = A - V*U**T - U*V**T
#ifdef WITH_OPENMP
    !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend  )
    n_threads = omp_get_num_threads()
    if(mod(n_threads, 2) == 0) then
        n_way = 2
    else
        n_way = 1
    endif

    m_way = n_threads / n_way

    m_id = mod(omp_get_thread_num(),  m_way)
    n_id = omp_get_thread_num() / m_way

    do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way
      i = ii / tile_size
      lcs = i*l_cols_tile+1
      lce = min(l_cols,(i+1)*l_cols_tile)
      lre = min(l_rows,(i+1)*l_rows_tile)
      if (lce<lcs .or. lre<1) cycle

      !Figure out this thread's range
      work_per_thread = lre / m_way
      if (work_per_thread * m_way < lre) work_per_thread = work_per_thread + 1
      mystart = m_id * work_per_thread + 1
      myend   = mystart + work_per_thread - 1
      if( myend > lre ) myend = lre
      if( myend-mystart+1 < 1) cycle

      call dgemm('N','T',myend-mystart+1, lce-lcs+1, 2*n_cols, -1.d0, &
                  vmr(mystart, 1), ubound(vmr,1), umc(lcs,1), ubound(umc,1), &
                  1.d0,a(mystart,lcs),ubound(a,1))
    enddo
    !$omp end parallel

#else /* WITH_OPENMP */
    do i=0,(istep*nbw-1)/tile_size
      lcs = i*l_cols_tile+1
      lce = min(l_cols,(i+1)*l_cols_tile)
      lre = min(l_rows,(i+1)*l_rows_tile)
      if (lce<lcs .or. lre<1) cycle
      call dgemm('N','T',lre,lce-lcs+1,2*n_cols,-1.d0, &
                  vmr,ubound(vmr,dim=1),umc(lcs,1),ubound(umc,dim=1), &
                  1.d0,a(1,lcs),lda)
    enddo
#endif /* WITH_OPENMP */
    deallocate(vmr, umc, vr)

  enddo

  if (useQR) then
    if (which_qr_decomposition == 1) then
      deallocate(work_blocked)
      deallocate(tauvector)
    endif
  endif

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("bandred_real")
#endif
end subroutine bandred_real

!-------------------------------------------------------------------------------

subroutine symm_matrix_allreduce(n,a,lda,ldb,comm)

!-------------------------------------------------------------------------------
!  symm_matrix_allreduce: Does an mpi_allreduce for a symmetric matrix A.
!  On entry, only the upper half of A needs to be set
!  On exit, the complete matrix is set
!-------------------------------------------------------------------------------
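!
!  Used above e.g. as:  call symm_matrix_allreduce(n_cols, vav, nbw, nbw, mpi_comm_rows)
!  with only the upper triangle of vav filled before the call.
!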
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   integer  :: n, lda, ldb, comm
   real*8   :: a(lda,ldb)