elpa2.F90 56.5 KB
Newer Older
1
!   This file is part of ELPA.
2
3
4
5
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
6
7
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8
9
10
11
12
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14
15
16
17
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
18
!    This particular source code file contains additions, changes and
Andreas Marek's avatar
Andreas Marek committed
19
!    enhancements authored by Intel Corporation which is not part of
20
!    the ELPA consortium.
21
22
!
!    More information can be found here:
23
!    http://elpa.mpcdf.mpg.de/
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"
64
!> \brief Fortran module which provides the routines to use the two-stage ELPA solver
65
66
67
68
module ELPA2

! Version 1.1.2, 2011-02-21

69
  use elpa_utilities
70
  use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve
71
  use elpa2_utilities
72

73
74
75
76
77
78
  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
  ! Double-precision specific entry points; these are always exported.
  public :: solve_evp_real_2stage_double
  public :: solve_evp_complex_2stage_double

  ! Generic name that resolves to the double-precision real solver.
  ! NOTE(review): the module default is PRIVATE and this generic name is not
  ! listed in any public statement visible here, so it looks inaccessible to
  ! users of the module -- confirm whether that is intended.
  interface solve_evp_real_2stage
    module procedure solve_evp_real_2stage_double
  end interface

  ! Generic name that resolves to the double-precision complex solver
  ! (same accessibility remark as for the real generic applies).
  interface solve_evp_complex_2stage
    module procedure solve_evp_complex_2stage_double
  end interface

  ! Single-precision entry points are exported only when the corresponding
  ! preprocessor switch is defined at build time.
#ifdef WANT_SINGLE_PRECISION_REAL
  public :: solve_evp_real_2stage_single
#endif

#ifdef WANT_SINGLE_PRECISION_COMPLEX
  public :: solve_evp_complex_2stage_single
#endif

98
99
100

!******
contains
101
!-------------------------------------------------------------------------------
102
!>  \brief solve_evp_real_2stage_double: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param useQR (optional)                     use QR decomposition
!>
!>  \result success                             logical, false if error occurred
!-------------------------------------------------------------------------------
140

141
142
143
144
145
146
147
148
149
150
#define DOUBLE_PRECISION_REAL

#ifdef DOUBLE_PRECISION_REAL
  function solve_evp_real_2stage_double(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)
#else
  function solve_evp_real_2stage_single(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)
#endif

    ! Driver of the real two-stage eigensolver. The steps, in order, are:
    !   1. reduction full -> band            (bandred_real_*)
    !   2. reduction band -> tridiagonal     (tridiag_band_real_*)
    !   3. solution of the tridiagonal problem (solve_tridi_*)
    !   4. back transformation tridiagonal -> band (trans_ev_tridi_to_band_real_*)
    !   5. back transformation band -> full        (trans_ev_band_to_full_real_*)
    !
    ! The result "success" is .true. unless the QR precondition check or one of
    ! the steps 1, 3 or 4 reports a failure. Allocation and deallocation
    ! failures terminate the program with "error stop".
    !
    ! Side effects: sets the module variables time_evp_fwd, time_evp_solve and
    ! time_evp_back, and prints per-step timings on the process with
    ! my_prow==0 and my_pcol==0 when elpa_print_times is set.

#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    use elpa1_compute
    use elpa2_compute
    use elpa_mpi
    use cuda_functions
    use mod_check_for_gpu
    use iso_c_binding
    implicit none

    logical, intent(in), optional             :: useQR
    logical                                   :: useQRActual, useQREnvironment
    integer(kind=c_int), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
    integer(kind=c_int)                       :: THIS_REAL_ELPA_KERNEL

    integer(kind=c_int), intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                                 mpi_comm_cols, mpi_comm_all
    integer(kind=c_int), intent(in)           :: nblk
    real(kind=c_double), intent(inout)        :: ev(na)
#ifdef USE_ASSUMED_SIZE
    real(kind=c_double), intent(inout)        :: a(lda,*), q(ldq,*)
#else
    real(kind=c_double), intent(inout)        :: a(lda,matrixCols), q(ldq,matrixCols)
#endif
    real(kind=c_double), allocatable          :: hh_trans_real(:,:)

    integer(kind=c_int)                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
    integer(kind=c_int)                       :: nbw, num_blocks
    real(kind=c_double), allocatable          :: tmat(:,:,:), e(:)
    integer(kind=c_intptr_t)                  :: tmat_dev, q_dev, a_dev
    real(kind=c_double)                       :: ttt0, ttt1, ttts  ! MPI_WTIME always needs double
    integer(kind=c_int)                       :: i
    logical                                   :: success
    logical, save                             :: firstCall = .true.
    logical                                   :: wantDebug
    integer(kind=c_int)                       :: istat
    character(200)                            :: errorMessage
    logical                                   :: useGPU
    integer(kind=c_int)                       :: numberOfGPUDevices

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_real_2stage_double")
#endif

    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

    ! The environment is consulted for debug output only on the very first
    ! call (firstCall is saved); on later calls wantDebug stays .false.
    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif

    success = .true.

    useQRActual = .false.
    useGPU      = .false.

    ! set usage of qr decomposition via API call
    if (present(useQR)) useQRActual = useQR

    ! overwrite this with environment variable settings
    if (qr_decomposition_via_environment_variable(useQREnvironment)) then
      useQRActual = useQREnvironment
    endif

    if (useQRActual) then
      ! NOTE(review): the test looks only at the parity of na, while the
      ! message below talks about the blocksize -- confirm which is intended.
      if (mod(na,2) .ne. 0) then
        if (wantDebug) then
          write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
        endif
        print *, "Do not use QR-decomposition for this matrix and blocksize."
        success = .false.
        ! close the timing region opened at entry before leaving early
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("solve_evp_real_2stage_double")
#endif
        return
      endif
    endif

    if (present(THIS_REAL_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
    else
      ! if kernel is not chosen via api
      ! check whether set by environment variable
      THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
    endif

    ! check whether chosen kernel is allowed: function returns true if NOT allowed! change this
    if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then

      if (my_pe == 0) then
        write(error_unit,*) " "
        write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
        do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
          endif
        enddo

        write(error_unit,*) " "
        ! check whether generic kernel is defined
        if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
          write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
        else
          write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used"
        endif
      endif  ! my_pe == 0

      ! fall back: generic kernel if available, otherwise the default kernel
      if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
        THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
      else
        THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
      endif
    endif

    if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
      ! NOTE(review): if check_for_gpu returns .false., useGPU stays .false.
      ! but THIS_REAL_ELPA_KERNEL remains the GPU kernel -- confirm that the
      ! back transformation copes with this combination.
      if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
        useGPU = .true.
      endif
      if (nblk .ne. 128) then
        print *,"At the moment GPU version needs blocksize 128"
        error stop
      endif

      ! set the necessary parameters
      cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
      cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
      cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
      cudaHostRegisterPortable = cuda_hostRegisterPortable()
      cudaHostRegisterMapped   = cuda_hostRegisterMapped()
    endif

    ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
    ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
    ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
    ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
    ! on this and maybe allow a run-time optimization here
    if (useGPU) then
      nbw = nblk
    else
      ! smallest multiple of nblk that is >= 64
      nbw = (63/nblk+1)*nblk
    endif

    ! number of column blocks of width nbw needed to cover na columns
    num_blocks = (na-1)/nbw + 1

    allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      write(error_unit,*) "solve_evp_real_2stage: error when allocating tmat "//trim(errorMessage)
      error stop
    endif

    ! Reduction full -> band

    ttt0 = MPI_Wtime()
    ttts = ttt0
#ifdef DOUBLE_PRECISION_REAL
    call bandred_real_double(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                             tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
#else
    call bandred_real_single(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                             tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
#endif
    if (.not. success) then
      ! local allocatables are released automatically on return
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_real_2stage_double")
#endif
      return
    endif
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

    ! Reduction band -> tridiagonal

    allocate(e(na), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      write(error_unit,*) "solve_evp_real_2stage: error when allocating e "//trim(errorMessage)
      error stop
    endif

    ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
    call tridiag_band_real_double(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                                  mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#else
    call tridiag_band_real_single(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                                  mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#endif

    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

#ifdef WITH_MPI

    ! distribute diagonal (ev) and off-diagonal (e) from rank 0 of mpi_comm_all
#ifdef DOUBLE_PRECISION_REAL
    call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
    call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
#else
    call mpi_bcast(ev,na,MPI_REAL4,0,mpi_comm_all,mpierr)
    call mpi_bcast(e,na,MPI_REAL4,0,mpi_comm_all,mpierr)
#endif

#endif /* WITH_MPI */
    ttt1 = MPI_Wtime()
    ! forward time covers both reductions plus the broadcasts
    time_evp_fwd = ttt1-ttts

    ! Solve tridiagonal system

    ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
    call solve_tridi_double(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                            mpi_comm_cols, wantDebug, success)
#else
    call solve_tridi_single(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                            mpi_comm_cols, wantDebug, success)
#endif
    if (.not. success) then
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_real_2stage_double")
#endif
      return
    endif

    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
    time_evp_solve = ttt1-ttt0
    ! restart the accumulated clock for the back transformation
    ttts = ttt1

    deallocate(e, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      write(error_unit,*) "solve_evp_real_2stage: error when deallocating e "//trim(errorMessage)
      error stop
    endif

    ! Backtransform stage 1

    ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
    call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
                                            mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success,      &
                                            THIS_REAL_ELPA_KERNEL)
#else
    call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
                                            mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success,      &
                                            THIS_REAL_ELPA_KERNEL)
#endif

    if (.not. success) then
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_real_2stage_double")
#endif
      return
    endif
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

    ! We can now deallocate the stored householder vectors
    deallocate(hh_trans_real, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      write(error_unit,*) "solve_evp_real_2stage: error when deallocating hh_trans_real "//trim(errorMessage)
      error stop
    endif

    ! Backtransform stage 2

    ! diagnostic only: report the GPU decision when debug output is requested
    if (wantDebug) write(error_unit,*) "useGPU== ",useGPU

    ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
    call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
                                           matrixCols, num_blocks, mpi_comm_rows, &
                                           mpi_comm_cols, useGPU, useQRActual)
#else
    call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
                                           matrixCols, num_blocks, mpi_comm_rows, &
                                           mpi_comm_cols, useGPU, useQRActual)
#endif

    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
    ! back-transformation time covers both stages (measured from ttts)
    time_evp_back = ttt1-ttts

    deallocate(tmat, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      write(error_unit,*) "solve_evp_real_2stage: error when deallocating tmat"//trim(errorMessage)
      error stop
    endif

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("solve_evp_real_2stage_double")
#endif

#ifdef DOUBLE_PRECISION_REAL
  end function solve_evp_real_2stage_double
#else
  end function solve_evp_real_2stage_single
#endif
457

458
459
460
461
#ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
!-------------------------------------------------------------------------------
!>  \brief solve_evp_real_2stage_single: Fortran function to solve the single-precision real eigenvalue problem with a 2 stage approach
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
495
496
!>  \param useQR (optional)                     use QR decomposition
!>
497
!>  \result success                             logical, false if error occurred
498
!-------------------------------------------------------------------------------
499
500
501
502
503
504
505
506
507
508
509
510
511
512

#ifdef DOUBLE_PRECISION_REAL
  function solve_evp_real_2stage_double(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)
#else
  function solve_evp_real_2stage_single(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)
#endif
513

514
#ifdef HAVE_DETAILED_TIMINGS
515
    use timings
516
#endif
517

518
519
   use cuda_functions
   use mod_check_for_gpu
520
   use iso_c_binding
521
522
523
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
524
   implicit none
525
526
527
528
529
530
531
532
533
   logical, intent(in), optional             :: useQR
   logical                                   :: useQRActual, useQREnvironment
   integer(kind=c_int), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer(kind=c_int)                       :: THIS_REAL_ELPA_KERNEL

   integer(kind=c_int), intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                                mpi_comm_cols, mpi_comm_all
   integer(kind=c_int), intent(in)           :: nblk
   real(kind=c_float), intent(inout)         :: ev(na)
534
#ifdef USE_ASSUMED_SIZE
535
   real(kind=c_float), intent(inout)         :: a(lda,*),  q(ldq,*)
536
537

#else
538
   real(kind=c_float), intent(inout)         :: a(lda,matrixCols),  q(ldq,matrixCols)
539
#endif
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
   real(kind=c_float), allocatable           :: hh_trans_real(:,:)

   integer(kind=c_int)                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer(kind=c_int)                       :: nbw, num_blocks
   real(kind=c_float), allocatable           :: tmat(:,:,:), e(:)
   integer(kind=c_intptr_t)                  :: tmat_dev, q_dev, a_dev
   real(kind=c_double)                       :: ttt0, ttt1, ttts  ! MPI_WTIME always needs double
   integer(kind=c_int)                       :: i
   logical                                   :: success
   logical, save                             :: firstCall = .true.
   logical                                   :: wantDebug
   integer(kind=c_int)                       :: istat
   character(200)                            :: errorMessage
   logical                                   :: useGPU
   integer(kind=c_int)                       :: numberOfGPUDevices
Andreas Marek's avatar
Andreas Marek committed
555

556
#ifdef HAVE_DETAILED_TIMINGS
557
    call timer%start("solve_evp_real_2stage_single")
558
#endif
559

560
561
562
563
564
565
566
567
568
569
570
571
572
573
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif
574

575
    success = .true.
576

577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
    useQRActual = .false.
    useGPU      = .false.

    ! set usage of qr decomposition via API call
    if (present(useQR)) then
      if (useQR) useQRActual = .true.
        if (.not.(useQR)) useQRACtual = .false.
    endif

    ! overwrite this with environment variable settings
    if (qr_decomposition_via_environment_variable(useQREnvironment)) then
      useQRActual = useQREnvironment
    endif

    if (useQRActual) then
592
      if (mod(na,2) .ne. 0) then
593
594
595
596
597
598
599
600
601
602
        if (wantDebug) then
          write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
        endif
        print *, "Do not use QR-decomposition for this matrix and blocksize."
        success = .false.
        return
      endif
    endif

    if (present(THIS_REAL_ELPA_KERNEL_API)) then
603
      ! user defined kernel via the optional argument in the API call
604
      THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
605
    else
606

607
608
      ! if kernel is not choosen via api
      ! check whether set by environment variable
609
      THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
610
    endif
611

612
    ! check whether choosen kernel is allowed
613
    if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
614
615
616

      if (my_pe == 0) then
        write(error_unit,*) " "
617
        write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
618
619
620
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
621
622
623
        do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
624
625
          endif
        enddo
626

627
        write(error_unit,*) " "
628
629
630
631
632
633
634
635
636
637
638
        ! check whether generic kernel is defined
         if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
           write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
         else
           write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used"
         endif
      endif  ! my_pe == 0
      if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
        THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
      else
        THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
639
640
      endif
    endif
641

642
643
644
    if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
      if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
        useGPU = .true.
645
646
647
      endif
      if (nblk .ne. 128) then
        print *,"At the moment GPU version needs blocksize 128"
648
        error stop
649
      endif
650
    ! some temporarilly checks until single precision works with all kernels
651

652
653
654
655
656
657
658
      ! set the neccessary parameters
      cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
      cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
      cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
      cudaHostRegisterPortable = cuda_hostRegisterPortable()
      cudaHostRegisterMapped   = cuda_hostRegisterMapped()
    endif
659

660
    ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
661
662
663
664
665
666
667
668
669
    ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
    ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
    ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
    ! on this and maybe allow a run-time optimization here
    if (useGPU) then
      nbw = nblk
    else
      nbw = (63/nblk+1)*nblk
    endif
670

671
672
673
674
    num_blocks = (na-1)/nbw + 1

    allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
675
      print *,"solve_evp_real_2stage: error when allocating tmat "//errorMessage
676
677
      stop
    endif
678

679
    ! Reduction full -> band
680

681
682
    ttt0 = MPI_Wtime()
    ttts = ttt0
683
#ifdef DOUBLE_PRECISION_REAL
684
685
    call bandred_real_double(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                      tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
686
#else
687
688
    call bandred_real_single(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                      tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
689
#endif
690
    if (.not.(success)) return
691
692
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
693
       write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0
694

695
     ! Reduction band -> tridiagonal
696

697
698
699
700
701
     allocate(e(na), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when allocating e "//errorMessage
       stop
     endif
702

703
704
705
706
707
708
709
710
     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
     call tridiag_band_real_double(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#else
     call tridiag_band_real_single(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#endif
711

712
713
714
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0
715

716
#ifdef WITH_MPI
717

718
719
720
#ifdef DOUBLE_PRECISION_REAL
     call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
     call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
721
#else
722
723
     call mpi_bcast(ev,na,MPI_REAL4,0,mpi_comm_all,mpierr)
     call mpi_bcast(e,na,MPI_REAL4,0,mpi_comm_all,mpierr)
724
#endif
725

726
#endif /* WITH_MPI */
727
728
     ttt1 = MPI_Wtime()
     time_evp_fwd = ttt1-ttts
729

730
     ! Solve tridiagonal system
731

732
733
734
735
736
737
738
739
740
     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
     call solve_tridi_double(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                      mpi_comm_cols, wantDebug, success)
#else
     call solve_tridi_single(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                      mpi_comm_cols, wantDebug, success)
#endif
     if (.not.(success)) return
741

742
743
744
745
746
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
     write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
     time_evp_solve = ttt1-ttt0
     ttts = ttt1
747

748
749
750
751
752
753
     deallocate(e, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating e "//errorMessage
       stop
     endif
     ! Backtransform stage 1
754

755
756
     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
757
     call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
758
759
760
                                    mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success,      &
                                    THIS_REAL_ELPA_KERNEL)
#else
761
     call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
762
763
764
                                    mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success,      &
                                    THIS_REAL_ELPA_KERNEL)
#endif
765

766
767
768
769
     if (.not.(success)) return
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0
770

771
772
773
774
775
776
777
778
779
780
781
782
     ! We can now deallocate the stored householder vectors
     deallocate(hh_trans_real, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating hh_trans_real "//errorMessage
       stop
     endif


     ! Backtransform stage 2
     print *,"useGPU== ",useGPU
     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Typo    
Andreas Marek committed
783
784
     call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
                                            matrixCols, num_blocks, mpi_comm_rows, &
785
                                            mpi_comm_cols, useGPU, useQRActual)
786
#else
Andreas Marek's avatar
Typo    
Andreas Marek committed
787
788
     call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
                                            matrixCols, num_blocks, mpi_comm_rows, &
789
                                            mpi_comm_cols, useGPU, useQRActual)
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
#endif

     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
     time_evp_back = ttt1-ttts

     deallocate(tmat, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating tmat"//errorMessage
       stop
     endif

#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("solve_evp_real_2stage_single")
#endif
1    format(a,f10.3)

#ifdef DOUBLE_PRECISION_REAL
   end function solve_evp_real_2stage_double
#else
   end function solve_evp_real_2stage_single
#endif

#endif /* WANT_SINGLE_PRECISION_REAL */

   !>  \brief solve_evp_complex_2stage_double: Fortran function to solve the double-precision complex eigenvalue problem with a 2 stage approach
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \result success                             logical, false if error occurred
!-------------------------------------------------------------------------------
#define DOUBLE_PRECISION_COMPLEX 1

#ifdef DOUBLE_PRECISION_COMPLEX
function solve_evp_complex_2stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
#else
function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
#endif
   ! Driver of the 2-stage complex eigenvalue solver. The stages are executed
   ! in this order:
   !   1. bandred_complex_*                : reduction full -> band
   !   2. tridiag_band_complex_*           : reduction band -> tridiagonal
   !   3. solve_tridi_*                    : solve the tridiagonal system
   !   4. trans_ev_tridi_to_band_complex_* : backtransform stage 1
   !   5. trans_ev_band_to_full_complex_*  : backtransform stage 2
   !
   ! "success" is set to .true. at entry and handed to the stages 1, 3 and 4;
   ! if one of them reports a failure the function returns immediately.
   ! Every exit path stops the detailed timer that is started at entry.

#ifdef HAVE_DETAILED_TIMINGS
   use timings
#endif
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
   use cuda_functions
   use mod_check_for_gpu
   use iso_c_binding
   implicit none

   integer(kind=c_int), intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer(kind=c_int)                       :: THIS_COMPLEX_ELPA_KERNEL
   integer(kind=c_int), intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   real(kind=c_double), intent(inout)        :: ev(na)
#ifdef USE_ASSUMED_SIZE
   complex(kind=c_double), intent(inout)     :: a(lda,*), q(ldq,*)
#else
   complex(kind=c_double), intent(inout)     :: a(lda,matrixCols), q(ldq,matrixCols)
#endif
   ! Householder vectors: handed to tridiag_band_complex_* and consumed by
   ! trans_ev_tridi_to_band_complex_*; deallocated here after stage 4
   complex(kind=c_double), allocatable       :: hh_trans_complex(:,:)

   integer(kind=c_int)                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer(kind=c_int)                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex(kind=c_double), allocatable       :: tmat(:,:,:)
   ! q_real: real work array passed to solve_tridi_*, afterwards copied into q
   ! e     : passed to tridiag_band_complex_* and solve_tridi_* together with ev
   real(kind=c_double), allocatable          :: q_real(:,:), e(:)
   real(kind=c_double)                       :: ttt0, ttt1, ttts  ! MPI_WTIME always needs double
   integer(kind=c_int)                       :: i

   logical                                   :: success, wantDebug
   logical, save                             :: firstCall = .true.
   integer(kind=c_int)                       :: istat
   character(200)                            :: errorMessage
   logical                                   :: useGPU
   integer(kind=c_int)                       :: numberOfGPUDevices

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_complex_2stage_double")
#endif

    ! Rank and size in the global, the row and the column communicator
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

    useGPU = .false.
    wantDebug = .false.
    ! The environment is only queried on the very first call (firstCall is
    ! saved); on all later calls wantDebug stays .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif


    success = .true.

    if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
    else
      ! if kernel is not choosen via api
      ! check whether set by environment variable
      THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
    endif

    ! check whether choosen kernel is allowed
    ! NOTE(review): the branch below is the "kernel not allowed" path, so
    ! check_allowed_complex_kernels presumably returns .true. on error
    ! - confirm against its definition
    if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then

      if (my_pe == 0) then
        write(error_unit,*) " "
        write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
        do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
          endif
        enddo

        write(error_unit,*) " "
        write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
      endif
      ! fall back to the generic kernel on all processes
      THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
    endif

    ! useGPU is only switched on if the GPU kernel has been selected and
    ! check_for_gpu reports success
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
      if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
        useGPU=.true.
      endif
      if (nblk .ne. 128) then
        print *,"At the moment GPU version needs blocksize 128"
        error stop
      endif

      ! set the neccessary parameters
      cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
      cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
      cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
      cudaHostRegisterPortable = cuda_hostRegisterPortable()
      cudaHostRegisterMapped   = cuda_hostRegisterMapped()
    endif

    ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

    nbw = (31/nblk+1)*nblk

    ! number of blocks of width nbw needed to cover na (rounded up)
    num_blocks = (na-1)/nbw + 1

    allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when allocating tmat"//errorMessage
      stop
    endif
    ! Reduction full -> band

    ttt0 = MPI_Wtime()
    ttts = ttt0  ! start of the forward phase, used for time_evp_fwd
#ifdef DOUBLE_PRECISION_COMPLEX
    call bandred_complex_double(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                         tmat, wantDebug, useGPU, success)
#else
    call bandred_complex_single(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                         tmat, wantDebug, useGPU, success)
#endif
    if (.not.(success)) then

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_complex_2stage_double")
#endif
      return
    endif
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0

    ! Reduction band -> tridiagonal

    allocate(e(na), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when allocating e"//errorMessage
      stop
    endif


    ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
   call tridiag_band_complex_double(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, &
                             mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#else
   call tridiag_band_complex_single(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, &
                             mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#endif

    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0

#ifdef WITH_MPI

    ! broadcast ev and e from rank 0 of mpi_comm_all to all processes
#ifdef DOUBLE_PRECISION_COMPLEX
    call mpi_bcast(ev, na, mpi_real8, 0, mpi_comm_all, mpierr)
    call mpi_bcast(e, na, mpi_real8, 0, mpi_comm_all, mpierr)
#else
    call mpi_bcast(ev, na, mpi_real4, 0, mpi_comm_all, mpierr)
    call mpi_bcast(e, na, mpi_real4, 0, mpi_comm_all, mpierr)
#endif

#endif /* WITH_MPI */
    ttt1 = MPI_Wtime()
    time_evp_fwd = ttt1-ttts

    l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
    l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
    l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

    allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when allocating q_real"//errorMessage
      stop
    endif

    ! Solve tridiagonal system

    ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
    call solve_tridi_double(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
                     mpi_comm_rows, mpi_comm_cols, wantDebug, success)
#else
    call solve_tridi_single(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
                     mpi_comm_rows, mpi_comm_cols, wantDebug, success)
#endif
    if (.not.(success)) then
      ! close the timer region also on this failure path (same handling as
      ! after bandred); the allocatable locals are released automatically
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_complex_2stage_double")
#endif
      return
    endif

    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
       write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
    time_evp_solve = ttt1-ttt0
    ttts = ttt1  ! start of the backtransformation phase, used for time_evp_back

    ! copy the local part belonging to the first nev columns from the real
    ! work array into the complex array q
    q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

    deallocate(e, q_real, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when deallocating e, q_real"//errorMessage
      stop
    endif


    ! Backtransform stage 1

    ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
    call trans_ev_tridi_to_band_complex_double(na, nev, nblk, nbw, q, ldq,  &
                                       matrixCols, hh_trans_complex, &
                                       mpi_comm_rows, mpi_comm_cols, &
                                       wantDebug, useGPU, success,THIS_COMPLEX_ELPA_KERNEL)
#else
    call trans_ev_tridi_to_band_complex_single(na, nev, nblk, nbw, q, ldq,  &
                                       matrixCols, hh_trans_complex, &
                                       mpi_comm_rows, mpi_comm_cols, &
                                       wantDebug, useGPU, success,THIS_COMPLEX_ELPA_KERNEL)
#endif
    if (.not.(success)) then
      ! close the timer region also on this failure path (same handling as
      ! after bandred); the allocatable locals are released automatically
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_complex_2stage_double")
#endif
      return
    endif
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0

    ! We can now deallocate the stored householder vectors
    deallocate(hh_trans_complex, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when deallocating hh_trans_complex"//errorMessage
      stop
    endif

    ! Backtransform stage 2

    ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
   call trans_ev_band_to_full_complex_double(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, &
                                      mpi_comm_rows, mpi_comm_cols, useGPU)
#else
   call trans_ev_band_to_full_complex_single(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, &
                                      mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
    time_evp_back = ttt1-ttts

    deallocate(tmat, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when deallocating tmat "//errorMessage
      stop
    endif

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("solve_evp_complex_2stage_double")
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
end function solve_evp_complex_2stage_double
#else
end function solve_evp_complex_2stage_single
#endif

#ifdef WANT_SINGLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_COMPLEX

!>  \brief solve_evp_complex_2stage_single: Fortran function to solve the single-precision complex eigenvalue problem with a 2 stage approach
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \result success                             logical, false if error occurred
!-------------------------------------------------------------------------------

#ifdef DOUBLE_PRECISION_COMPLEX
function solve_evp_complex_2stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
#else
function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
#endif


#ifdef HAVE_DETAILED_TIMINGS
   use timings
#endif
   use cuda_functions
   use mod_check_for_gpu
1190
1191
1192
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
1193
1194
   use iso_c_binding
   implicit none
1195
1196
1197
   integer(kind=c_int), intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer(kind=c_int)                       :: THIS_COMPLEX_ELPA_KERNEL
   integer(kind=c_int), intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
1198
#ifdef USE_ASSUMED_SIZE
1199
   complex(kind=c_float), intent(inout)      :: a(lda,*), q(ldq,*)
1200
#else
1201
   complex(kind=c_float), intent(inout)      :: a(lda,matrixCols), q(ldq,matrixCols)
1202
#endif
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
   real(kind=c_float), intent(inout)         :: ev(na)
   complex(kind=c_float), allocatable        :: hh_trans_complex(:,:)

   integer(kind=c_int)                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer(kind=c_int)                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex(kind=c_float), allocatable        :: tmat(:,:,:)
   real(kind=c_float), allocatable           :: q_real(:,:), e(:)
   real(kind=c_double)                       :: ttt0, ttt1, ttts  ! MPI_WTIME always needs double
   integer(kind=c_int)                       :: i

   logical                                   :: success, wantDebug
   logical, save                             :: firstCall = .true.
   integer(kind=c_int)                       :: istat
   character(200)                            :: errorMessage
   logical                                   :: useGPU
   integer(kind=c_int)                       :: numberOfGPUDevices
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_complex_2stage_single")
#endif

    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

    useGPU = .false.
    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif


    success = .true.

    if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
    else
      ! if kernel is not choosen via api
      ! check whether set by environment variable
      THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
    endif

    ! check whether choosen kernel is allowed
    if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then

      if (my_pe == 0) then
        write(error_unit,*) " "
        write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
        do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
          endif
        enddo

        write(error_unit,*) " "
        write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
      endif
      THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
    endif
1272

1273
1274
1275
1276
1277
1278
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
      if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
        useGPU=.true.
      endif
      if (nblk .ne. 128) then
        print *,"At the moment GPU version needs blocksize 128"
1279
        error stop
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329