!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"

module ELPA2

! Version 1.1.2, 2011-02-21

  use elpa_utilities
  USE elpa1_compute
  use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve
  use elpa2_utilities
  use elpa_pdgeqrf

  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

  public :: bandred_real
  public :: tridiag_band_real
  public :: trans_ev_tridi_to_band_real
  public :: trans_ev_band_to_full_real

  public :: bandred_complex
  public :: tridiag_band_complex
  public :: trans_ev_tridi_to_band_complex
  public :: trans_ev_band_to_full_complex

  public :: band_band_real
  public :: divide_band

  integer, public :: which_qr_decomposition = 1     ! defines which QR-decomposition algorithm will be used
                                                    ! 0 for unblocked
                                                    ! 1 for blocked (maxrank: nblk)
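                                                    ! (note: the variable is public, so a program that
                                                    !  has "use ELPA2" may change it before calling
                                                    !  solve_evp_real_2stage with useQR enabled)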
!-------------------------------------------------------------------------------

  ! The following array contains the Householder vectors of the
  ! transformation band -> tridiagonal.
  ! It is allocated and set in tridiag_band_real and used in
  ! trans_ev_tridi_to_band_real.
  ! It must be deallocated by the user after trans_ev_tridi_to_band_real!

!  real*8, allocatable :: hh_trans_real(:,:)
!  complex*16, allocatable :: hh_trans_complex(:,:)
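  ! (note: in this version the 2-stage driver routines below use local
  !  allocatable arrays for these vectors instead of the commented-out
  !  module-level arrays above; see solve_evp_real_2stage / _complex_2stage)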

!-------------------------------------------------------------------------------

  include 'mpif.h'


!******
contains

function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                               mpi_comm_rows, mpi_comm_cols,             &
                               mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,  &
                               useQR) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_real_2stage: Solves the real eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a and q
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,matrixCols)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
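!  Usage (sketch):
!
!  A minimal call could look as follows; this is only a sketch and assumes
!  that the row/column communicators have already been derived from
!  mpi_comm_all (e.g. with ELPA1's communicator setup routine) and that
!  a, ev and q are allocated with the block-cyclic layout described above:
!
!     success = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                     matrixCols, mpi_comm_rows,         &
!                                     mpi_comm_cols, mpi_comm_all)
!     if (.not.(success)) then
!        ! handle the failure reported by the solver
!     endif
!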
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   logical, intent(in), optional :: useQR
   logical                       :: useQRActual, useQREnvironment
   integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer                       :: THIS_REAL_ELPA_KERNEL

   integer, intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                    mpi_comm_cols, mpi_comm_all
   integer, intent(in)           :: nblk
   real*8, intent(inout)         :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)
   real*8, allocatable           :: hh_trans_real(:,:)

   integer                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer                       :: nbw, num_blocks
   real*8, allocatable           :: tmat(:,:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
   logical                       :: success

   logical, save                 :: firstCall = .true.
   logical                       :: wantDebug

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif

   success = .true.

   useQRActual = .false.

   ! set usage of qr decomposition via API call
   if (present(useQR)) then
     if (useQR) useQRActual = .true.
     if (.not.(useQR)) useQRActual = .false.
   endif

   ! overwrite this with environment variable settings
   if (qr_decomposition_via_environment_variable(useQREnvironment)) then
     useQRActual = useQREnvironment
   endif

   if (useQRActual) then
     if (mod(na,nblk) .ne. 0) then
       if (wantDebug) then
         write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
       endif
       print *, "Do not use QR-decomposition for this matrix and blocksize."
       success = .false.
       return
     endif
   endif

   if (present(THIS_REAL_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
   else
     ! if kernel is not chosen via API
     ! check whether set by environment variable
     THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
   endif

   ! check whether the chosen kernel is allowed
   if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The chosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
         endif
       enddo
       write(error_unit,*) " "
       write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used!"
     endif
     THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC

   endif

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
   ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
   ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
   ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
   ! on this and maybe allow a run-time optimization here
   nbw = (63/nblk+1)*nblk
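   ! (with integer division this is the smallest multiple of nblk that is >= 64,
   !  e.g. nblk=16 and nblk=32 both give nbw=64)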

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                     tmat, wantDebug, success, useQRActual)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                    mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
     write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
                                    mpi_comm_rows, mpi_comm_cols, wantDebug, success,      &
                                    THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                   mpi_comm_cols, useQRActual)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif
1  format(a,f10.3)

end function solve_evp_real_2stage

!-------------------------------------------------------------------------------

!-------------------------------------------------------------------------------

function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                  mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a and q
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,matrixCols)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
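!  Usage (sketch):
!
!  A minimal call could look as follows; this is only a sketch and assumes
!  that the row/column communicators have already been derived from
!  mpi_comm_all and that a, ev and q are allocated with the block-cyclic
!  layout described above:
!
!     success = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                        matrixCols, mpi_comm_rows,         &
!                                        mpi_comm_cols, mpi_comm_all)
!     if (.not.(success)) then
!        ! handle the failure reported by the solver
!     endif
!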
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer                       :: THIS_COMPLEX_ELPA_KERNEL
   integer, intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex*16, intent(inout)     :: a(lda,matrixCols), q(ldq,matrixCols)
   real*8, intent(inout)         :: ev(na)
   complex*16, allocatable       :: hh_trans_complex(:,:)

   integer                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex*16, allocatable       :: tmat(:,:,:)
   real*8, allocatable           :: q_real(:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i

   logical                       :: success, wantDebug
   logical, save                 :: firstCall = .true.

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif


   success = .true.

   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
   else
     ! if kernel is not chosen via API
     ! check whether set by environment variable
     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
   endif

   ! check whether the chosen kernel is allowed
   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then
     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The chosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
         endif
       enddo
       write(error_unit,*) " "
       write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used!"
     endif
     THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
   endif
   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk
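   ! (with integer division this is the smallest multiple of nblk that is >= 32)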

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop()
#endif
     return
   endif
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, &
                             mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

   allocate(q_real(l_rows,l_cols))

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
                    mpi_comm_rows, mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
      write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,   &
                                       matrixCols, hh_trans_complex, &
                                       mpi_comm_rows, mpi_comm_cols, &
                                       wantDebug, success,THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, &
                                      mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif

1  format(a,f10.3)

end function solve_evp_complex_2stage

!-------------------------------------------------------------------------------

subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success, useQR)

!-------------------------------------------------------------------------------
!  bandred_real: Reduces a distributed symmetric matrix to band form
!
!  Parameters
!
!  na          Order of matrix
!
!  a(lda,matrixCols)    Distributed matrix which should be reduced.
!              Distribution is like in Scalapack.
!              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
!              a(:,:) is overwritten on exit with the band and the Householder vectors
!              in the upper half.
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi bandwidth of output matrix
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
!              Factors for the Householder vectors (returned), needed for back transformation
!
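!  Usage (sketch):
!
!  bandred_real is normally driven by solve_evp_real_2stage; a direct call,
!  with nbw a multiple of nblk and tmat allocated as tmat(nbw,nbw,numBlocks),
!  would look like this (sketch only):
!
!     call bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, &
!                       mpi_comm_rows, mpi_comm_cols, tmat,           &
!                       wantDebug, success, useQR)
!     if (.not.(success)) then
!        ! e.g. nbw was not a multiple of nblk
!     endif
!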
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
#ifdef WITH_OPENMP
   use omp_lib
#endif
   implicit none

   integer             :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
   real*8              :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)

   integer             :: my_prow, my_pcol, np_rows, np_cols, mpierr
   integer             :: l_cols, l_rows
   integer             :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
   integer             :: istep, ncol, lch, lcx, nlc, mynlc
   integer             :: tile_size, l_rows_tile, l_cols_tile

   real*8              :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)

   real*8, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)

   ! needed for blocked QR decomposition
   integer             :: PQRPARAM(11), work_size
   real*8              :: dwork_size(1)
   real*8, allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)

   logical, intent(in) :: wantDebug
   logical, intent(out):: success

   logical, intent(in) :: useQR

   integer :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, ii, pp, transformChunkSize

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("bandred_real")
#endif
   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
   success = .true.

   ! Semibandwidth nbw must be a multiple of blocksize nblk
   if (mod(nbw,nblk)/=0) then
     if (my_prow==0 .and. my_pcol==0) then
       if (wantDebug) then
         write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk
         write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk'
       endif
       success = .false.
       return
     endif
   endif

   ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

   tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
   tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
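   ! (example: np_rows=np_cols=4 and nblk=16 give tile_size=64 from the first line
   !  and 512 from the second, i.e. 128 local rows/columns of a tile per process)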

   l_rows_tile = tile_size/np_rows ! local rows of a tile
   l_cols_tile = tile_size/np_cols ! local cols of a tile

   if (useQR) then
     if (which_qr_decomposition == 1) then
       call qr_pqrparam_init(pqrparam,    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
       allocate(tauvector(na))
       allocate(blockheuristic(nblk))
       l_rows = local_index(na, my_prow, np_rows, nblk, -1)
       allocate(vmr(max(l_rows,1),na))

       call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), tmat(1,1,1), nbw, dwork_size(1), -1, na, &
                             nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
       work_size = dwork_size(1)
       allocate(work_blocked(work_size))

       work_blocked = 0.0d0
       deallocate(vmr)
     endif
   endif

   do istep = (na-1)/nbw, 1, -1

     n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

     ! Number of local columns/rows of remaining matrix
     l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
     l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

     ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

     allocate(vmr(max(l_rows,1),2*n_cols))
     allocate(umc(max(l_cols,1),2*n_cols))

     allocate(vr(l_rows+1))

     vmr(1:l_rows,1:n_cols) = 0.
     vr(:) = 0
     tmat(:,:,istep) = 0

     ! Reduce current block to lower triangular form

     if (useQR) then
       if (which_qr_decomposition == 1) then
         call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), &
                                  tmat(1,1,istep), nbw, work_blocked,       &
                                  work_size, na, n_cols, nblk, nblk,        &
                                  istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                  0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
                                  blockheuristic)
       endif
     else

       do lc = n_cols, 1, -1

         ncol = istep*nbw + lc ! absolute column number of householder vector
         nrow = ncol - nbw ! Absolute number of pivot row

         lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
         lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

         tau = 0
         if (nrow == 1) exit ! Nothing to do
         cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block
         if (my_pcol==cur_pcol) then
           ! Get vector to be transformed; distribute last element and norm of
           ! remaining elements to all procs in current column
           vr(1:lr) = a(1:lr,lch) ! vector to be transformed
           if (my_prow==prow(nrow, nblk, np_rows)) then
             aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
             aux1(2) = vr(lr)
           else
             aux1(1) = dot_product(vr(1:lr),vr(1:lr))
             aux1(2) = 0.
           endif
           call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
           vnorm2 = aux2(1)
           vrl    = aux2(2)
           ! Householder transformation
           call hh_transform_real(vrl, vnorm2, xf, tau)
           ! Scale vr and store Householder vector for back transformation
           vr(1:lr) = vr(1:lr) * xf
           if (my_prow==prow(nrow, nblk, np_rows)) then
             a(1:lr-1,lch) = vr(1:lr-1)
             a(lr,lch) = vrl
             vr(lr) = 1.
           else
             a(1:lr,lch) = vr(1:lr)
           endif
         endif
         ! Broadcast Householder vector and tau along columns
         vr(lr+1) = tau
         call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
         vmr(1:lr,lc) = vr(1:lr)
         tau = vr(lr+1)
         tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
         ! Transform remaining columns in current block with Householder vector
         ! Local dot product
         aux1 = 0
#ifdef WITH_OPENMP
         !Open up one omp region to avoid paying openmp overhead.
         !This does not help performance due to the addition of two openmp barriers around the MPI call,
         !But in the future this may be beneficial if these barriers are replaced with a faster implementation

         !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
         mynlc = 0 ! number of local columns

         !This loop does not have independent iterations,
         !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
         !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
         !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
         !the iteration number modulo the number of threads
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0 ) then
             mynlc = mynlc+1
             if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                 if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           endif
         enddo
         ! Get global dot products
         !$omp barrier
         !$omp single
         if (mynlc>0) call mpi_allreduce(aux1,aux2,mynlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
         !$omp end single
         !$omp barrier

         ! Transform
         transformChunkSize=32
         mynlc = 0
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             mynlc = mynlc+1
             !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
             !However, for some reason this is slower than doing it manually, so it is parallelized as below.
             do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                do pp = 1,transformChunkSize
                    if (pp + ii > lr) exit
                        a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
                enddo
             enddo
           endif
         enddo
         !$omp end parallel
#else
         nlc = 0 ! number of local columns
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
           endif
         enddo
         ! Get global dot products
         if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
         ! Transform
         nlc = 0
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
           endif
         enddo
#endif
       enddo
       ! Calculate scalar products of stored Householder vectors.
       ! This can be done in different ways, we use dsyrk
       vav = 0
       if (l_rows>0) &
           call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,dim=1),0.d0,vav,ubound(vav,dim=1))
       call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows)

       ! Calculate triangular matrix T for block Householder Transformation

       do lc=n_cols,1,-1
         tau = tmat(lc,lc,istep)
         if (lc<n_cols) then
           call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,dim=1),vav(lc+1,lc),1)
           tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
         endif
       enddo
     endif

    ! Transpose vmr -> vmc (stored in umc, second half)

    call elpa_transpose_vectors_real  (vmr, ubound(vmr,dim=1), mpi_comm_rows, &
                                    umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, &
                                    1, istep*nbw, n_cols, nblk)

    ! Calculate umc = A**T * vmr
    ! Note that the distributed A has to be transposed
    ! Opposed to direct tridiagonalization there is no need to use the cache locality
    ! of the tiles, so we can use strips of the matrix
    !Code for Algorithm 4

    n_way = 1
#ifdef WITH_OPENMP
    n_way = omp_get_max_threads()
#endif
    !umc(1:l_cols,1:n_cols) = 0.d0
    !vmr(1:l_rows,n_cols+1:2*n_cols) = 0
#ifdef WITH_OPENMP
    !$omp parallel private( i,lcs,lce,lrs,lre)
#endif
    if(n_way > 1) then
        !$omp do
        do i=1,min(l_cols_tile, l_cols)
            umc(i,1:n_cols) = 0.d0
        enddo
        !$omp do
        do i=1,l_rows
            vmr(i,n_cols+1:2*n_cols) = 0.d0
        enddo
        if (l_cols>0 .and. l_rows>0) then

          !SYMM variant 4
          !Partitioned Matrix Expression:
          ! Ct = Atl Bt + Atr Bb
          ! Cb = Atr' Bt + Abl Bb
          !
          !Loop invariant:
          ! Ct = Atl Bt + Atr Bb
          !
          !Update:
          ! C1 = A10'B0 + A11B1 + A21 B2
          !
          !This algorithm was chosen because the loop around the dgemm calls
          !is easily parallelized, and regardless of the choice of algorithm,
          !the startup cost for parallelizing the dgemms inside the loop is too great

          !$omp do schedule(static,1)
          do i=0,(istep*nbw-1)/tile_size
            lcs = i*l_cols_tile+1                   ! local column start
            lce = min(l_cols, (i+1)*l_cols_tile)    ! local column end

            lrs = i*l_rows_tile+1                   ! local row start
            lre = min(l_rows, (i+1)*l_rows_tile)    ! local row end

            !C1 += [A11 A12] [B1
            !                 B2]
            if( lre > lrs .and. l_cols > lcs ) then
            call DGEMM('N','N', lre-lrs+1, n_cols, l_cols-lcs+1,    &
                       1.d0, a(lrs,lcs), ubound(a,dim=1),           &
                             umc(lcs,n_cols+1), ubound(umc,dim=1),  &
                       0.d0, vmr(lrs,n_cols+1), ubound(vmr,dim=1))
            endif

            ! C1 += A10' B0
            if( lce > lcs .and. i > 0 ) then
            call DGEMM('T','N', lce-lcs+1, n_cols, lrs-1,           &
                       1.d0, a(1,lcs),   ubound(a,dim=1),           &
                             vmr(1,1),   ubound(vmr,dim=1),         &
                       0.d0, umc(lcs,1), ubound(umc,dim=1))
            endif
          enddo
        endif
    else
        umc(1:l_cols,1:n_cols) = 0.d0
        vmr(1:l_rows,n_cols+1:2*n_cols) = 0
        if (l_cols>0 .and. l_rows>0) then
          do i=0,(istep*nbw-1)/tile_size

            lcs = i*l_cols_tile+1
            lce = min(l_cols,(i+1)*l_cols_tile)
            if (lce<lcs) cycle

            lre = min(l_rows,(i+1)*l_rows_tile)
            call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,dim=1), &
                         vmr,ubound(vmr,dim=1),1.d0,umc(lcs,1),ubound(umc,dim=1))

            if (i==0) cycle
            lre = min(l_rows,i*l_rows_tile)
            call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
                         umc(lcs,n_cols+1),ubound(umc,dim=1),1.d0,vmr(1,n_cols+1),ubound(vmr,dim=1))
          enddo
        endif
    endif
#ifdef WITH_OPENMP
    !$omp end parallel
#endif
    ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
    ! on the processors containing the diagonal
    ! This is only necessary if ur has been calculated, i.e. if the
    ! global tile size is smaller than the global remaining matrix
    ! Or if we used the Algorithm 4
    if (tile_size < istep*nbw .or. n_way > 1) then
    call elpa_reduce_add_vectors_real  (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, &
                                        umc, ubound(umc,dim=1), mpi_comm_cols, &
                                        istep*nbw, n_cols, nblk)
    endif

    if (l_cols>0) then
      allocate(tmp(l_cols,n_cols))
      call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
      umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
      deallocate(tmp)
    endif

    ! U = U * Tmat**T

    call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,dim=1),umc,ubound(umc,dim=1))

    ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T

    call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umc,ubound(umc,dim=1),umc(1,n_cols+1), &
               ubound(umc,dim=1),0.d0,vav,ubound(vav,dim=1))
    call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep),    &
               ubound(tmat,dim=1),vav,ubound(vav,dim=1))
    call symm_matrix_allreduce(n_cols,vav, nbw, nbw ,mpi_comm_cols)
    ! U = U - 0.5 * V * VAV
    call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umc(1,n_cols+1),ubound(umc,dim=1),vav, &
                ubound(vav,dim=1),1.d0,umc,ubound(umc,dim=1))
    ! Transpose umc -> umr (stored in vmr, second half)
    call elpa_transpose_vectors_real  (umc, ubound(umc,dim=1), mpi_comm_cols, &
                                   vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
                                   1, istep*nbw, n_cols, nblk)

    ! A = A - V*U**T - U*V**T
#ifdef WITH_OPENMP
    !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend  )
    n_threads = omp_get_num_threads()
    if(mod(n_threads, 2) == 0) then
        n_way = 2
    else
        n_way = 1
    endif
    m_way = n_threads / n_way
    m_id = mod(omp_get_thread_num(),  m_way)
    n_id = omp_get_thread_num() / m_way
    do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way
      i = ii / tile_size
      lcs = i*l_cols_tile+1
      lce = min(l_cols,(i+1)*l_cols_tile)
      lre = min(l_rows,(i+1)*l_rows_tile)
      if (lce<lcs .or. lre<1) cycle

      !Figure out this thread's range
      work_per_thread = lre / m_way
      if (work_per_thread * m_way < lre) work_per_thread = work_per_thread + 1
      mystart = m_id * work_per_thread + 1
      myend   = mystart + work_per_thread - 1
      if( myend > lre ) myend = lre
      if( myend-mystart+1 < 1) cycle

      call dgemm('N','T',myend-mystart+1, lce-lcs+1, 2*n_cols, -1.d0, &
                  vmr(mystart, 1), ubound(vmr,1), umc(lcs,1), ubound(umc,1), &
                  1.d0,a(mystart,lcs),ubound(a,1))
    enddo
    !$omp end parallel

#else /* WITH_OPENMP */
    do i=0,(istep*nbw-1)/tile_size
      lcs = i*l_cols_tile+1
      lce = min(l_cols,(i+1)*l_cols_tile)
      lre = min(l_rows,(i+1)*l_rows_tile)
      if (lce<lcs .or. lre<1) cycle
      call dgemm('N','T',lre,lce-lcs+1,2*n_cols,-1.d0, &
                  vmr,ubound(vmr,dim=1),umc(lcs,1),ubound(umc,dim=1), &
                  1.d0,a(1,lcs),lda)
    enddo
#endif /* WITH_OPENMP */
    deallocate(vmr, umc, vr)
  enddo
  if (useQR) then
    if (which_qr_decomposition == 1) then
      deallocate(work_blocked)
      deallocate(tauvector)
    endif
  endif