!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation, which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"
!> \brief Fortran module which provides the routines to use the two-stage ELPA solver
module ELPA2

! Version 1.1.2, 2011-02-21

  use elpa_utilities
  use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve
  use elpa2_utilities

  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage


!******
contains
!-------------------------------------------------------------------------------
!>  \brief solve_evp_real_2stage: Fortran function to solve the real eigenvalue problem with a two-stage approach
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in ScaLAPACK.
!>                                              The full matrix must be set (not only one half like in ScaLAPACK).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in ScaLAPACK.
!>                                              Must always be dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param use_qr (optional)                    use QR decomposition
!>
!>  \result success                             logical, false if error occurred
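!>
!>  \par Example
!>  A minimal call sketch. It assumes that the block-cyclically distributed
!>  matrices and the row/column/global communicators have already been set up
!>  as in the ELPA test programs (e.g. with get_elpa_row_col_comms from the
!>  elpa1 module):
!>
!>    success = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!>                                    matrixCols, mpi_comm_rows,         &
!>                                    mpi_comm_cols, mpi_comm_all)
!>    if (.not.(success)) then
!>      write(error_unit,*) "solve_evp_real_2stage failed"
!>      call mpi_abort(mpi_comm_all, 1, mpierr)
!>    endif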
!-------------------------------------------------------------------------------
function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                               mpi_comm_rows, mpi_comm_cols,           &
                               mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                               useQR) result(success)

#ifdef HAVE_DETAILED_TIMINGS
   use timings
#endif
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
   use precision
   implicit none
   logical, intent(in), optional          :: useQR
   logical                                :: useQRActual, useQREnvironment
   integer(kind=ik), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer(kind=ik)                       :: THIS_REAL_ELPA_KERNEL

   integer(kind=ik), intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                             mpi_comm_cols, mpi_comm_all
   integer(kind=ik), intent(in)           :: nblk
   real(kind=rk), intent(inout)           :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)
   ! was
   ! real a(lda,*), q(ldq,*)
   real(kind=rk), allocatable             :: hh_trans_real(:,:)

   integer(kind=ik)                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer(kind=ik)                       :: nbw, num_blocks
   real(kind=rk), allocatable             :: tmat(:,:,:), e(:)
   real(kind=rk)                          :: ttt0, ttt1, ttts
   integer(kind=ik)                       :: i
   logical                                :: success
   logical, save                          :: firstCall = .true.
   logical                                :: wantDebug

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif

   success = .true.

   useQRActual = .false.

   ! set usage of QR decomposition via API call
   if (present(useQR)) then
     useQRActual = useQR
   endif

   ! overwrite this with environment variable settings
   if (qr_decomposition_via_environment_variable(useQREnvironment)) then
     useQRActual = useQREnvironment
   endif

   if (useQRActual) then
     if (mod(na,2) .ne. 0) then
       if (wantDebug) then
         write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
       endif
       print *, "Do not use QR-decomposition for this matrix and blocksize."
       success = .false.
       return
     endif
   endif

   if (present(THIS_REAL_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
   else
     ! if the kernel is not chosen via the API,
     ! check whether it is set by an environment variable
     THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
   endif

   ! check whether the chosen kernel is allowed (note: the function returns true if the kernel is NOT allowed; change this)
   if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then

     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The chosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
         endif
       enddo

       write(error_unit,*) " "
       ! check whether the generic kernel is defined
       if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
         write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used!"
       else
         write(error_unit,*) "The default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used instead"
       endif
     endif  ! my_pe == 0
     if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
       THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
     else
       THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
     endif
   endif

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
   ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
   ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
   ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
   ! on this and maybe allow a run-time optimization here
   nbw = (63/nblk+1)*nblk
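
   ! For example, with Fortran integer division: nblk=16 gives
   ! nbw = (63/16+1)*16 = 64 and nblk=32 gives nbw = (63/32+1)*32 = 64,
   ! i.e. nbw is the smallest multiple of nblk that is at least 64.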

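   ! number of block columns of width nbw, i.e. ceiling(na/nbw)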
   num_blocks = (na-1)/nbw + 1

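   ! tmat stores the blocked Householder factors from the reduction to band
   ! form; they are needed again for the back transformation below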
   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                     tmat, wantDebug, success, useQRActual)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0
#ifdef WITH_MPI
   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
#endif
   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                    mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
                                    mpi_comm_rows, mpi_comm_cols, wantDebug, success,      &
                                    THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                   mpi_comm_cols, useQRActual)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif
1  format(a,f10.3)

end function solve_evp_real_2stage


!-------------------------------------------------------------------------------
!>  \brief solve_evp_complex_2stage: Fortran function to solve the complex eigenvalue problem with a two-stage approach
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in ScaLAPACK.
!>                                              The full matrix must be set (not only one half like in ScaLAPACK).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in ScaLAPACK.
!>                                              Must always be dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \result success                             logical, false if error occurred
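!>
!>  \par Example
!>  Analogous to the real case; the same distribution and communicator setup
!>  is assumed (see the example for solve_evp_real_2stage):
!>
!>    success = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!>                                       matrixCols, mpi_comm_rows,         &
!>                                       mpi_comm_cols, mpi_comm_all)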
!-------------------------------------------------------------------------------
function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                  mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)

#ifdef HAVE_DETAILED_TIMINGS
   use timings
#endif
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
   use precision
   implicit none
   integer(kind=ik), intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer(kind=ik)                       :: THIS_COMPLEX_ELPA_KERNEL
   integer(kind=ik), intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex(kind=ck), intent(inout)        :: a(lda,matrixCols), q(ldq,matrixCols)
   ! was
   ! complex a(lda,*), q(ldq,*)
   real(kind=rk), intent(inout)           :: ev(na)
   complex(kind=ck), allocatable          :: hh_trans_complex(:,:)

   integer(kind=ik)                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer(kind=ik)                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex(kind=ck), allocatable          :: tmat(:,:,:)
   real(kind=rk), allocatable             :: q_real(:,:), e(:)
   real(kind=rk)                          :: ttt0, ttt1, ttts
   integer(kind=ik)                       :: i

   logical                                :: success, wantDebug
   logical, save                          :: firstCall = .true.

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif


   success = .true.

   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
   else
     ! if the kernel is not chosen via the API,
     ! check whether it is set by an environment variable
     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
   endif

   ! check whether the chosen kernel is allowed (the function returns true if it is NOT allowed)
   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then

     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The chosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
         endif
       enddo

       write(error_unit,*) " "
       ! check whether the generic kernel is defined
       if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then
         write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used!"
       else
         write(error_unit,*) "The default kernel ",COMPLEX_ELPA_KERNEL_NAMES(DEFAULT_COMPLEX_ELPA_KERNEL)," will be used instead"
       endif
     endif  ! my_pe == 0
     if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then
       THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
     else
       THIS_COMPLEX_ELPA_KERNEL = DEFAULT_COMPLEX_ELPA_KERNEL
     endif
   endif
   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk
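
   ! For example, with Fortran integer division: nblk=16 gives
   ! nbw = (31/16+1)*16 = 32, i.e. nbw is the smallest multiple of nblk
   ! that is at least 32.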

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop()
#endif
     return
   endif
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, &
                             mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0
#ifdef WITH_MPI
   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
#endif
   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

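   ! solve_tridi works in real arithmetic, so the eigenvectors are computed
   ! in a real work array q_real first and copied into the complex q below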
   allocate(q_real(l_rows,l_cols))

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
                    mpi_comm_rows, mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
      write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,   &
                                       matrixCols, hh_trans_complex, &
                                       mpi_comm_rows, mpi_comm_cols, &
                                       wantDebug, success, THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) return
531
   ttt1 = MPI_Wtime()
532
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
533
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0
534
535
536
537
538
539
540

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, &
                                      mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif

1  format(a,f10.3)

end function solve_evp_complex_2stage

end module ELPA2