!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"

module ELPA2

! Version 1.1.2, 2011-02-21

  use elpa_utilities
  USE ELPA1
  use elpa2_utilities
  use elpa_pdgeqrf

  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

  public :: bandred_real
  public :: tridiag_band_real
  public :: trans_ev_tridi_to_band_real
  public :: trans_ev_band_to_full_real

  public :: bandred_complex
  public :: tridiag_band_complex
  public :: trans_ev_tridi_to_band_complex
  public :: trans_ev_band_to_full_complex

  public :: band_band_real
  public :: divide_band

  integer, public :: which_qr_decomposition = 1     ! defines which QR-decomposition algorithm will be used
                                                    ! 0 for unblocked
                                                    ! 1 for blocked (maxrank: nblk)
!-------------------------------------------------------------------------------

  ! The following array contains the Householder vectors of the
  ! transformation band -> tridiagonal.
  ! It is allocated and set in tridiag_band_real and used in
  ! trans_ev_tridi_to_band_real.
  ! It must be deallocated by the user after trans_ev_tridi_to_band_real!

!  real*8, allocatable :: hh_trans_real(:,:)
!  complex*16, allocatable :: hh_trans_complex(:,:)

!-------------------------------------------------------------------------------

  include 'mpif.h'


!******
contains

function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                               mpi_comm_rows, mpi_comm_cols,             &
                               mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,  &
                               useQR) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_real_2stage: Solves the real eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in ScaLAPACK.
!              The full matrix must be set (not only one half like in ScaLAPACK).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a and q
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,matrixCols)    On output: Eigenvectors of a
!              Distribution is like in ScaLAPACK.
!              Must always be dimensioned to the full size (corresponding to (na,na)),
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
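!
!  Usage sketch (illustrative only; the matrix distribution and the
!  communicators mpi_comm_rows/mpi_comm_cols/mpi_comm_all are assumed to
!  have been set up beforehand, e.g. with the ELPA1 helper routines):
!
!     logical :: success
!     success = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                     matrixCols, mpi_comm_rows,         &
!                                     mpi_comm_cols, mpi_comm_all)
!     if (.not.success) write(error_unit,*) 'solve_evp_real_2stage failed'
!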
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   logical, intent(in), optional :: useQR
   logical                       :: useQRActual, useQREnvironment
   integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer                       :: THIS_REAL_ELPA_KERNEL

   integer, intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                    mpi_comm_cols, mpi_comm_all
   integer, intent(in)           :: nblk
   real*8, intent(inout)         :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)
   real*8, allocatable           :: hh_trans_real(:,:)

   integer                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer                       :: nbw, num_blocks
   real*8, allocatable           :: tmat(:,:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
   logical                       :: success
   logical, save                 :: firstCall = .true.
   logical                       :: wantDebug

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif

   success = .true.

   useQRActual = .false.

   ! set usage of qr decomposition via API call
   if (present(useQR)) then
     if (useQR) useQRActual = .true.
     if (.not.(useQR)) useQRActual = .false.
   endif

   ! overwrite this with environment variable settings
   if (qr_decomposition_via_environment_variable(useQREnvironment)) then
     useQRActual = useQREnvironment
   endif

   if (useQRActual) then
     if (mod(na,nblk) .ne. 0) then
       if (wantDebug) then
         write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
       endif
       print *, "Do not use QR-decomposition for this matrix and blocksize."
       success = .false.
       return
     endif
   endif

   if (present(THIS_REAL_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
   else

     ! if kernel is not chosen via the API
     ! check whether set by environment variable
     THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
   endif

   ! check whether chosen kernel is allowed
   if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then

     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The chosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
         endif
       enddo

       write(error_unit,*) " "
       write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used!"
     endif
     THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC

   endif

   ! Choose bandwidth, must be a multiple of nblk; set here to the smallest multiple of nblk >= 64.
   ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
   ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
   ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
   ! on this and maybe allow a run-time optimization here
   nbw = (63/nblk+1)*nblk
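   ! For illustration: with nblk = 16 this gives nbw = (63/16+1)*16 = 64, and
   ! with nblk = 48 it gives nbw = (63/48+1)*48 = 96, i.e. the smallest
   ! multiple of nblk that is at least 64 (integer division).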

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                     tmat, wantDebug, success, useQRActual)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                    mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
     write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
                                    mpi_comm_rows, mpi_comm_cols, wantDebug, success,      &
                                    THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                   mpi_comm_cols, useQRActual)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif

1  format(a,f10.3)

end function solve_evp_real_2stage

!-------------------------------------------------------------------------------

!-------------------------------------------------------------------------------

function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols, &
                                  mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in ScaLAPACK.
!              The full matrix must be set (not only one half like in ScaLAPACK).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a and q
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,matrixCols)    On output: Eigenvectors of a
!              Distribution is like in ScaLAPACK.
!              Must always be dimensioned to the full size (corresponding to (na,na)),
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
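!
!  Usage sketch (illustrative only; the matrix distribution and the
!  communicators mpi_comm_rows/mpi_comm_cols/mpi_comm_all are assumed to
!  have been set up beforehand, as for the real 2-stage solver):
!
!     logical :: success
!     success = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                        matrixCols, mpi_comm_rows,         &
!                                        mpi_comm_cols, mpi_comm_all)
!     if (.not.success) write(error_unit,*) 'solve_evp_complex_2stage failed'
!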
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none

   integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer                       :: THIS_COMPLEX_ELPA_KERNEL
   integer, intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex*16, intent(inout)     :: a(lda,matrixCols), q(ldq,matrixCols)
   real*8, intent(inout)         :: ev(na)
   complex*16, allocatable       :: hh_trans_complex(:,:)

   integer                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex*16, allocatable       :: tmat(:,:,:)
   real*8, allocatable           :: q_real(:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i

   logical                       :: success, wantDebug
   logical, save                 :: firstCall = .true.

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif


   success = .true.

   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
   else
     ! if kernel is not chosen via the API
     ! check whether set by environment variable
     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
   endif

   ! check whether chosen kernel is allowed
   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then

     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The chosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
         endif
       enddo

       write(error_unit,*) " "
       write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used!"
     endif
     THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
   endif
   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk
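   ! For illustration: with nblk = 16 this gives nbw = (31/16+1)*16 = 32,
   ! i.e. the smallest multiple of nblk that is at least 32 (integer division).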

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop()
#endif
     return
   endif
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, &
                             mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

   allocate(q_real(l_rows,l_cols))

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
                    mpi_comm_rows, mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
      write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,   &
                                       matrixCols, hh_trans_complex, &
                                       mpi_comm_rows, mpi_comm_cols, &
                                       wantDebug, success,THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, &
                                      mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif

1  format(a,f10.3)

end function solve_evp_complex_2stage

!-------------------------------------------------------------------------------

subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success, useQR)

!-------------------------------------------------------------------------------
!  bandred_real: Reduces a distributed symmetric matrix to band form
!
!  Parameters
!
!  na          Order of matrix
!
!  a(lda,matrixCols)    Distributed matrix which should be reduced.
!              Distribution is like in ScaLAPACK.
!              As opposed to ScaLAPACK, a(:,:) must be set completely (upper and lower half).
!              a(:,:) is overwritten on exit with the band and the Householder vectors
!              in the upper half.
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi bandwidth of output matrix
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
!              Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
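!
!  Usage sketch (illustrative only; this mirrors how the 2-stage driver above
!  calls the routine, with nbw and the communicators assumed to be set up):
!
!     numBlocks = (na-1)/nbw + 1
!     allocate(tmat(nbw,nbw,numBlocks))
!     call bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, &
!                       mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, &
!                       success, useQR)
!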
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
#ifdef WITH_OPENMP
   use omp_lib
#endif
   implicit none

   integer             :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
   real*8              :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)

   integer             :: my_prow, my_pcol, np_rows, np_cols, mpierr
   integer             :: l_cols, l_rows
   integer             :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
   integer             :: istep, ncol, lch, lcx, nlc, mynlc
   integer             :: tile_size, l_rows_tile, l_cols_tile

   real*8              :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)

   real*8, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)

   ! needed for blocked QR decomposition
   integer             :: PQRPARAM(11), work_size
   real*8              :: dwork_size(1)
   real*8, allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)

   logical, intent(in) :: wantDebug
   logical, intent(out):: success

   logical, intent(in) :: useQR

   integer :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, ii, pp, transformChunkSize

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("bandred_real")
#endif
   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
   success = .true.


   ! Semibandwidth nbw must be a multiple of blocksize nblk
   if (mod(nbw,nblk)/=0) then
     if (my_prow==0 .and. my_pcol==0) then
       if (wantDebug) then
         write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk
         write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk'
       endif
       success = .false.
       return
     endif
   endif

   ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

   tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
   tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

   l_rows_tile = tile_size/np_rows ! local rows of a tile
   l_cols_tile = tile_size/np_cols ! local cols of a tile
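   ! For illustration: with np_rows = 4, np_cols = 3 and nblk = 16 the minimum
   ! global tile size is 16*lcm(4,3) = 192; rounding up so that local tiles are
   ! at least 128 wide gives tile_size = 576, i.e. l_rows_tile = 144 and
   ! l_cols_tile = 192.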

   if (useQR) then
     if (which_qr_decomposition == 1) then
       call qr_pqrparam_init(pqrparam,    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
       allocate(tauvector(na))
       allocate(blockheuristic(nblk))
       l_rows = local_index(na, my_prow, np_rows, nblk, -1)
       allocate(vmr(max(l_rows,1),na))

       call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), tmat(1,1,1), nbw, dwork_size(1), -1, na, &
                             nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
       work_size = dwork_size(1)
       allocate(work_blocked(work_size))

       work_blocked = 0.0d0
       deallocate(vmr)
     endif
   endif

   do istep = (na-1)/nbw, 1, -1

     n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

     ! Number of local columns/rows of remaining matrix
     l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
     l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

     ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

     allocate(vmr(max(l_rows,1),2*n_cols))
     allocate(umc(max(l_cols,1),2*n_cols))

     allocate(vr(l_rows+1))

     vmr(1:l_rows,1:n_cols) = 0.
     vr(:) = 0
     tmat(:,:,istep) = 0

     ! Reduce current block to lower triangular form

     if (useQR) then
       if (which_qr_decomposition == 1) then
         call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), &
                                  tmat(1,1,istep), nbw, work_blocked,       &
                                  work_size, na, n_cols, nblk, nblk,        &
                                  istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                  0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
                                  blockheuristic)
       endif
     else

       do lc = n_cols, 1, -1

         ncol = istep*nbw + lc ! absolute column number of householder vector
         nrow = ncol - nbw ! Absolute number of pivot row

         lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
         lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

         tau = 0

         if (nrow == 1) exit ! Nothing to do

         cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

         if (my_pcol==cur_pcol) then

           ! Get vector to be transformed; distribute last element and norm of
           ! remaining elements to all procs in current column

           vr(1:lr) = a(1:lr,lch) ! vector to be transformed

           if (my_prow==prow(nrow, nblk, np_rows)) then
             aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
             aux1(2) = vr(lr)
           else
             aux1(1) = dot_product(vr(1:lr),vr(1:lr))
             aux1(2) = 0.
           endif

           call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

           vnorm2 = aux2(1)
           vrl    = aux2(2)

           ! Householder transformation

           call hh_transform_real(vrl, vnorm2, xf, tau)

           ! Scale vr and store Householder vector for back transformation

           vr(1:lr) = vr(1:lr) * xf
           if (my_prow==prow(nrow, nblk, np_rows)) then
             a(1:lr-1,lch) = vr(1:lr-1)
             a(lr,lch) = vrl
             vr(lr) = 1.
           else
             a(1:lr,lch) = vr(1:lr)
           endif

         endif

         ! Broadcast Householder vector and tau along columns

         vr(lr+1) = tau
         call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
         vmr(1:lr,lc) = vr(1:lr)
         tau = vr(lr+1)
         tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat

         ! Transform remaining columns in current block with Householder vector
         ! Local dot product

         aux1 = 0
#ifdef WITH_OPENMP
         !Open up one omp region to avoid paying openmp overhead.
         !This does not help performance due to the addition of two openmp barriers around the MPI call,
         !But in the future this may be beneficial if these barriers are replaced with a faster implementation

         !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
         mynlc = 0 ! number of local columns

         !This loop does not have independent iterations,
         !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
         !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
         !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
         !the iteration number modulo the number of threads
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0 ) then
             mynlc = mynlc+1
             if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                 if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           endif
         enddo

         ! Get global dot products
         !$omp barrier
         !$omp single
         if (mynlc>0) call mpi_allreduce(aux1,aux2,mynlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
         !$omp end single
         !$omp barrier

         ! Transform
         transformChunkSize=32
         mynlc = 0
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             mynlc = mynlc+1
             !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
             !However, for some reason this is slower than doing it manually, so it is parallelized as below.
             do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                do pp = 1,transformChunkSize
                  if (pp + ii > lr) exit
                  a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
                enddo
             enddo
           endif
         enddo
         !$omp end parallel
#else
         nlc = 0 ! number of local columns
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
           endif
         enddo

         ! Get global dot products
         if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

         ! Transform

         nlc = 0
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
           endif
         enddo
#endif
       enddo

       ! Calculate scalar products of stored Householder vectors.
       ! This can be done in different ways, we use dsyrk

       vav = 0
       if (l_rows>0) &
           call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,dim=1),0.d0,vav,ubound(vav,dim=1))
       call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows)

       ! Calculate triangular matrix T for block Householder Transformation

       do lc=n_cols,1,-1
         tau = tmat(lc,lc,istep)
         if (lc<n_cols) then
           call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,dim=1),vav(lc+1,lc),1)
           tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
         endif
       enddo
     endif

    ! Transpose vmr -> vmc (stored in umc, second half)

    call elpa_transpose_vectors_real  (vmr, ubound(vmr,dim=1), mpi_comm_rows, &
                                    umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, &
                                    1, istep*nbw, n_cols, nblk)

    ! Calculate umc = A**T * vmr
    ! Note that the distributed A has to be transposed
    ! As opposed to direct tridiagonalization there is no need to use the cache locality
    ! of the tiles, so we can use strips of the matrix
    ! Code for Algorithm 4

    n_way = 1
#ifdef WITH_OPENMP
    n_way = omp_get_max_threads()
#endif
    !umc(1:l_cols,1:n_cols) = 0.d0
    !vmr(1:l_rows,n_cols+1:2*n_cols) = 0
#ifdef WITH_OPENMP
    !$omp parallel private( i,lcs,lce,lrs,lre)
#endif
    if(n_way > 1) then
        !$omp do
        do i=1,min(l_cols_tile, l_cols)
            umc(i,1:n_cols) = 0.d0
        enddo
        !$omp do
        do i=1,l_rows
            vmr(i,n_cols+1:2*n_cols) = 0.d0
        enddo
        if (l_cols>0 .and. l_rows>0) then

          !SYMM variant 4
          !Partitioned Matrix Expression:
          ! Ct = Atl Bt + Atr Bb
          ! Cb = Atr' Bt + Abl Bb
          !
          !Loop invariant:
          ! Ct = Atl Bt + Atr Bb
          !
          !Update:
          ! C1 = A10'B0 + A11B1 + A21 B2
          !
          !This algorithm was chosen because here the loop around the dgemm calls
          !is easily parallelized, and regardless of the choice of algorithm,
          !the startup cost for parallelizing the dgemms inside the loop is too great

          !$omp do schedule(static,1)
          do i=0,(istep*nbw-1)/tile_size
            lcs = i*l_cols_tile+1                   ! local column start
            lce = min(l_cols, (i+1)*l_cols_tile)    ! local column end

            lrs = i*l_rows_tile+1                   ! local row start
            lre = min(l_rows, (i+1)*l_rows_tile)    ! local row end

            !C1 += [A11 A12] [B1
            !                 B2]
            if( lre > lrs .and. l_cols > lcs ) then
            call DGEMM('N','N', lre-lrs+1, n_cols, l_cols-lcs+1,    &
                       1.d0, a(lrs,lcs), ubound(a,dim=1),           &
                             umc(lcs,n_cols+1), ubound(umc,dim=1),  &
                       0.d0, vmr(lrs,n_cols+1), ubound(vmr,dim=1))
            endif

            ! C1 += A10' B0
            if( lce > lcs .and. i > 0 ) then
            call DGEMM('T','N', lce-lcs+1, n_cols, lrs-1,           &
                       1.d0, a(1,lcs),   ubound(a,dim=1),           &
                             vmr(1,1),   ubound(vmr,dim=1),         &
                       0.d0, umc(lcs,1), ubound(umc,dim=1))
            endif
          enddo
        endif
    else
        umc(1:l_cols,1:n_cols) = 0.d0
        vmr(1:l_rows,n_cols+1:2*n_cols) = 0
        if (l_cols>0 .and. l_rows>0) then
          do i=0,(istep*nbw-1)/tile_size

            lcs = i*l_cols_tile+1
            lce = min(l_cols,(i+1)*l_cols_tile)
            if (lce<lcs) cycle

            lre = min(l_rows,(i+1)*l_rows_tile)
            call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,dim=1), &
                         vmr,ubound(vmr,dim=1),1.d0,umc(lcs,1),ubound(umc,dim=1))

            if (i==0) cycle
            lre = min(l_rows,i*l_rows_tile)
            call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
                         umc(lcs,n_cols+1),ubound(umc,dim=1),1.d0,vmr(1,n_cols+1),ubound(vmr,dim=1))
          enddo
        endif
    endif
#ifdef WITH_OPENMP
    !$omp end parallel
#endif
    ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
    ! on the processors containing the diagonal
    ! This is only necessary if ur has been calculated, i.e. if the
    ! global tile size is smaller than the global remaining matrix
    ! Or if we used the Algorithm 4
    if (tile_size < istep*nbw .or. n_way > 1) then
    call elpa_reduce_add_vectors_real  (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, &
                                        umc, ubound(umc,dim=1), mpi_comm_cols, &
                                        istep*nbw, n_cols, nblk)
    endif

    if (l_cols>0) then
      allocate(tmp(l_cols,n_cols))
      call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
      umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
      deallocate(tmp)
    endif

    ! U = U * Tmat**T

    call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,dim=1),umc,ubound(umc,dim=1))

    ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T

    call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umc,ubound(umc,dim=1),umc(1,n_cols+1), &
               ubound(umc,dim=1),0.d0,vav,ubound(vav,dim=1))
    call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep),    &
               ubound(tmat,dim=1),vav,ubound(vav,dim=1))

    call symm_matrix_allreduce(n_cols,vav, nbw, nbw ,mpi_comm_cols)

    ! U = U - 0.5 * V * VAV
    call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umc(1,n_cols+1),ubound(umc,dim=1),vav, &
                ubound(vav,dim=1),1.d0,umc,ubound(umc,dim=1))

    ! Transpose umc -> umr (stored in vmr, second half)

    call elpa_transpose_vectors_real  (umc, ubound(umc,dim=1), mpi_comm_cols, &
                                   vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
                                   1, istep*nbw, n_cols, nblk)

    ! A = A - V*U**T - U*V**T

#ifdef WITH_OPENMP
    !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend  )
    n_threads = omp_get_num_threads()
    if(mod(n_threads, 2) == 0) then
        n_way = 2
    else
        n_way = 1
    endif

    m_way = n_threads / n_way

    m_id = mod(omp_get_thread_num(),  m_way)
    n_id = omp_get_thread_num() / m_way

    do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way
      i = ii / tile_size
      lcs = i*l_cols_tile+1
      lce = min(l_cols,(i+1)*l_cols_tile)
      lre = min(l_rows,(i+1)*l_rows_tile)
      if (lce<lcs .or. lre<1) cycle

      !Figure out this thread's range
      work_per_thread = lre / m_way
      if (work_per_thread * m_way < lre) work_per_thread = work_per_thread + 1
      mystart = m_id * work_per_thread + 1
      myend   = mystart + work_per_thread - 1
      if( myend > lre ) myend = lre
      if( myend-mystart+1 < 1) cycle

      call dgemm('N','T',myend-mystart+1, lce-lcs+1, 2*n_cols, -1.d0, &
                  vmr(mystart, 1), ubound(vmr,1), umc(lcs,1), ubound(umc,1), &
                  1.d0,a(mystart,lcs),ubound(a,1))
    enddo
    !$omp end parallel

#else /* WITH_OPENMP */
    do i=0,(istep*nbw-1)/tile_size
      lcs = i*l_cols_tile+1
      lce = min(l_cols,(i+1)*l_cols_tile)
      lre = min(l_rows,(i+1)*l_rows_tile)
      if (lce<lcs .or. lre<1) cycle
      call dgemm('N','T',lre,lce-lcs+1,2*n_cols,-1.d0, &
                  vmr,ubound(vmr,dim=1),umc(lcs,1),ubound(umc,dim=1), &
                  1.d0,a(1,lcs),lda)
    enddo
#endif /* WITH_OPENMP */
    deallocate(vmr, umc, vr)

  enddo

  if (useQR) then
    if (which_qr_decomposition == 1) then
      deallocate(work_blocked)
      deallocate(tauvector)
    endif