elpa2.F90 71.6 KB
Newer Older
1
!   This file is part of ELPA.
2
3
4
5
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
6
7
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8
9
10
11
12
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
14
15
16
17
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
18
!    This particular source code file contains additions, changes and
Andreas Marek's avatar
Andreas Marek committed
19
!    enhancements authored by Intel Corporation which is not part of
20
!    the ELPA consortium.
21
22
!
!    More information can be found here:
23
!    http://elpa.mpcdf.mpg.de/
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"
64
!> \brief Fortran module which provides the routines to use the 2-stage ELPA solver
65
66
67
68
module ELPA2

! Version 1.1.2, 2011-02-21

69
  use elpa_utilities
70
  use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve
71
  use elpa2_utilities
72

73
74
75
76
77
78
  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

79
80
81
82
  public :: solve_evp_real_2stage_double               !< old, deprecated interface: Driver routine for real double-precision eigenvalue problem. will be deleted at some point
  public :: solve_evp_complex_2stage_double            !< old, deprecated interface: Driver routine for complex double-precision eigenvalue problem. will be deleted at some point
  public :: elpa_solve_evp_real_2stage_double          !< Driver routine for real double-precision 2-stage eigenvalue problem
  public :: elpa_solve_evp_complex_2stage_double       !< Driver routine for complex double-precision 2-stage eigenvalue problem
83

84

85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
!-------------------------------------------------------------------------------
!>  \brief solve_evp_real_2stage: Old, deprecated interface for elpa_solve_evp_real_2stage_double
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param use_qr (optional)                    use QR decomposition
!>
!>  \result success                             logical, false if error occured
!-------------------------------------------------------------------------------
124
125
126
127
  interface solve_evp_real_2stage
    module procedure solve_evp_real_2stage_double
  end interface

128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
!-------------------------------------------------------------------------------
!>  \brief elpa_solve_evp_real_2stage_double: Fortran function to solve the real double-precision eigenvalue problem with a 2 stage approach. This is called by "elpa_solve_evp_real_double"
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param use_qr (optional)                    use QR decomposition
!>
!>  \result success                             logical, false if error occured
!-------------------------------------------------------------------------------
167
168
169
170
  interface elpa_solve_evp_real_2stage_double
    module procedure solve_evp_real_2stage_double
  end interface

171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
!-------------------------------------------------------------------------------
!>  \brief solve_evp_complex_2stage: Old, deprecated interface for elpa_solve_evp_complex_2stage_double
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \result success                             logical, false if error occured
!-------------------------------------------------------------------------------
208
209
210
211
  interface solve_evp_complex_2stage
    module procedure solve_evp_complex_2stage_double
  end interface

212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
!-------------------------------------------------------------------------------
!>  \brief elpa_solve_evp_complex_2stage_double: Fortran function to solve the complex double-precision eigenvalue problem with a 2 stage approach. This is called by "elpa_solve_evp_complex_double"
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \result success                             logical, false if error occured
!-------------------------------------------------------------------------------
249
250
251
252
  interface elpa_solve_evp_complex_2stage_double
    module procedure solve_evp_complex_2stage_double
  end interface

253
254
#ifdef WANT_SINGLE_PRECISION_REAL
  public :: solve_evp_real_2stage_single
255
  public :: elpa_solve_evp_real_2stage_single
256
257
258
259
#endif

#ifdef WANT_SINGLE_PRECISION_COMPLEX
  public :: solve_evp_complex_2stage_single
260
  public :: elpa_solve_evp_complex_2stage_single
261
262
#endif

263
#ifdef WANT_SINGLE_PRECISION_REAL
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
!-------------------------------------------------------------------------------
!>  \brief elpa_solve_evp_real_2stage_single: Fortran function to solve the real single-precision eigenvalue problem with a 2 stage approach. This is called by "elpa_solve_evp_real_single"
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param use_qr (optional)                    use QR decomposition
!>
!>  \result success                             logical, false if error occured
!-------------------------------------------------------------------------------
303
304
305
306
307
308
  interface elpa_solve_evp_real_2stage_single
    module procedure solve_evp_real_2stage_single
  end interface
#endif

#ifdef WANT_SINGLE_PRECISION_COMPLEX
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
!-------------------------------------------------------------------------------
!>  \brief elpa_solve_evp_complex_2stage_single: Fortran function to solve the complex double-precision eigenvalue problem with a 2 stage approach. This is called by "elpa_solve_evp_complex_single"
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \result success                             logical, false if error occured
!-------------------------------------------------------------------------------
346
  interface elpa_solve_evp_complex_2stage_single
347
348
349
    module procedure solve_evp_complex_2stage_single
  end interface
#endif
350

351
  contains
352
!-------------------------------------------------------------------------------
353
!>  \brief solve_evp_real_2stage_double: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param use_qr (optional)                    use QR decomposition
!>
!>  \result success                             logical, false if error occured
!-------------------------------------------------------------------------------
391

392
393
394
395
396
397
398
399
400
401
#define DOUBLE_PRECISION_REAL

#ifdef DOUBLE_PRECISION_REAL
  function solve_evp_real_2stage_double(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)
#else
  function solve_evp_real_2stage_single(na, nev, a, lda, ev, q, ldq, nblk,        &
402
                               matrixCols,                               &
403
404
405
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)
406
#endif
407

408

409
#ifdef HAVE_DETAILED_TIMINGS
410
    use timings
411
#endif
412

413
414
415
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
416
417
   use cuda_functions
   use mod_check_for_gpu
418
   use iso_c_binding
419
   implicit none
Andreas Marek's avatar
Andreas Marek committed
420
421
   logical, intent(in), optional          :: useQR
   logical                                :: useQRActual, useQREnvironment
422
423
   integer(kind=c_int), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer(kind=c_int)                       :: THIS_REAL_ELPA_KERNEL
Andreas Marek's avatar
Andreas Marek committed
424

425
   integer(kind=c_int), intent(in)        :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
Andreas Marek's avatar
Andreas Marek committed
426
                                             mpi_comm_cols, mpi_comm_all
427
428
   integer(kind=c_int), intent(in)        :: nblk
   real(kind=c_double), intent(inout)     :: ev(na)
429
#ifdef USE_ASSUMED_SIZE
430
   real(kind=c_double), intent(inout)     :: a(lda,*), q(ldq,*)
431
#else
432
   real(kind=c_double), intent(inout)     :: a(lda,matrixCols), q(ldq,matrixCols)
433
#endif
434
   real(kind=c_double), allocatable       :: hh_trans_real(:,:)
Andreas Marek's avatar
Andreas Marek committed
435

436
437
438
   integer(kind=c_int)                    :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer(kind=c_int)                    :: nbw, num_blocks
   real(kind=c_double), allocatable       :: tmat(:,:,:), e(:)
439
   integer(kind=c_intptr_t)               :: tmat_dev, q_dev, a_dev
440
   real(kind=c_double)                    :: ttt0, ttt1, ttts  ! MPI_WTIME always needs double
441
   integer(kind=c_int)                    :: i
Andreas Marek's avatar
Andreas Marek committed
442
443
444
   logical                                :: success
   logical, save                          :: firstCall = .true.
   logical                                :: wantDebug
445
   integer(kind=c_int)                    :: istat
446
447
   character(200)                         :: errorMessage
   logical                                :: useGPU
448
   integer(kind=c_int)                    :: numberOfGPUDevices
Andreas Marek's avatar
Andreas Marek committed
449

450
#ifdef HAVE_DETAILED_TIMINGS
451
    call timer%start("solve_evp_real_2stage_double")
452
#endif
453

454
455
456
#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("mpi_communication")
#endif
457
458
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
459

460
461
462
463
    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
464
465
466
#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("mpi_communication")
#endif
467

468
469
470
471
472
473
    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif
474

475
    success = .true.
476

477
478
    useQRActual = .false.
    useGPU      = .false.
479

480
481
482
483
484
    ! set usage of qr decomposition via API call
    if (present(useQR)) then
      if (useQR) useQRActual = .true.
        if (.not.(useQR)) useQRACtual = .false.
    endif
485

486
487
488
489
    ! overwrite this with environment variable settings
    if (qr_decomposition_via_environment_variable(useQREnvironment)) then
      useQRActual = useQREnvironment
    endif
490

491
    if (useQRActual) then
492
      if (mod(na,2) .ne. 0) then
493
494
495
496
497
498
499
500
        if (wantDebug) then
          write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
        endif
        print *, "Do not use QR-decomposition for this matrix and blocksize."
        success = .false.
        return
      endif
    endif
501

502

503
504
505
506
    if (present(THIS_REAL_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
    else
507

508
509
510
511
      ! if kernel is not choosen via api
      ! check whether set by environment variable
      THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
    endif
Andreas Marek's avatar
Andreas Marek committed
512

513
    ! check whether choosen kernel is allowed: function returns true if NOT allowed! change this
514
515
516
517
518
519
520
521
522
523
524
525
526
    if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then

      if (my_pe == 0) then
        write(error_unit,*) " "
        write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
        do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
          endif
        enddo
Andreas Marek's avatar
Andreas Marek committed
527

528
        write(error_unit,*) " "
529
530
531
532
533
534
535
536
537
538
539
        ! check whether generic kernel is defined
         if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
           write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
         else
           write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used"
         endif
      endif  ! my_pe == 0
      if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
        THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
      else
        THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
540
541
542
543
      endif
    endif

    if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
544
      if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
545
546
547
548
        useGPU = .true.
      endif
      if (nblk .ne. 128) then
        print *,"At the moment GPU version needs blocksize 128"
549
        error stop
550
      endif
551

552
553
554
555
556
557
558
      ! set the neccessary parameters
      cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
      cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
      cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
      cudaHostRegisterPortable = cuda_hostRegisterPortable()
      cudaHostRegisterMapped   = cuda_hostRegisterMapped()
    endif
559

560
    ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
561
562
563
564
    ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
    ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
    ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
    ! on this and maybe allow a run-time optimization here
565
566
567
    if (useGPU) then
      nbw = nblk
    else
568
      nbw = (63/nblk+1)*nblk
569
    endif
570

571
    num_blocks = (na-1)/nbw + 1
572

573
574
575
576
577
    allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_real_2stage: error when allocating tmat "//errorMessage
      stop
    endif
578

579
    ! Reduction full -> band
580

581
582
    ttt0 = MPI_Wtime()
    ttts = ttt0
583
#ifdef DOUBLE_PRECISION_REAL
584
585
    call bandred_real_double(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                             tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
586
#else
587
588
    call bandred_real_single(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                             tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
589
#endif
590
591
592
593
    if (.not.(success)) return
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0
594

595
     ! Reduction band -> tridiagonal
596

597
598
599
600
601
     allocate(e(na), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when allocating e "//errorMessage
       stop
     endif
602

603
     ttt0 = MPI_Wtime()
604
605
#ifdef DOUBLE_PRECISION_REAL
     call tridiag_band_real_double(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
606
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
607
608
609
610
#else
     call tridiag_band_real_single(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#endif
611

612
613
614
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0
615

616
#ifdef WITH_MPI
617
618
619
#ifdef HAVE_DETAILED_TIMINGS
     call timer%start("mpi_communication")
#endif
620
#ifdef DOUBLE_PRECISION_REAL
621
622
     call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
     call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
623
624
625
626
#else
     call mpi_bcast(ev,na,MPI_REAL4,0,mpi_comm_all,mpierr)
     call mpi_bcast(e,na,MPI_REAL4,0,mpi_comm_all,mpierr)
#endif
627
628
629
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("mpi_communication")
#endif
630
#endif /* WITH_MPI */
631
632
     ttt1 = MPI_Wtime()
     time_evp_fwd = ttt1-ttts
633

634
     ! Solve tridiagonal system
635

636
     ttt0 = MPI_Wtime()
637
638
#ifdef DOUBLE_PRECISION_REAL
     call solve_tridi_double(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
639
                      mpi_comm_cols, wantDebug, success)
640
641
642
643
#else
     call solve_tridi_single(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                      mpi_comm_cols, wantDebug, success)
#endif
644
645
646
647
     if (.not.(success)) return

     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
648
     write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
649
650
     time_evp_solve = ttt1-ttt0
     ttts = ttt1
651

652
653
654
655
656
657
658
659
     deallocate(e, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating e "//errorMessage
       stop
     endif
     ! Backtransform stage 1

     ttt0 = MPI_Wtime()
660
#ifdef DOUBLE_PRECISION_REAL
661
     call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
662
663
664
                                    mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success,      &
                                    THIS_REAL_ELPA_KERNEL)
#else
665
     call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
666
                                    mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success,      &
667
                                    THIS_REAL_ELPA_KERNEL)
668
#endif
669

670
671
672
673
     if (.not.(success)) return
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0
674

675
676
677
678
679
680
     ! We can now deallocate the stored householder vectors
     deallocate(hh_trans_real, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating hh_trans_real "//errorMessage
       stop
     endif
681
682


683
684
685
     ! Backtransform stage 2
     print *,"useGPU== ",useGPU
     ttt0 = MPI_Wtime()
686
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Typo    
Andreas Marek committed
687
688
689
     call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
                                            matrixCols, num_blocks, mpi_comm_rows, &
                                            mpi_comm_cols, useGPU, useQRActual)
690
#else
Andreas Marek's avatar
Typo    
Andreas Marek committed
691
692
693
     call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
                                            matrixCols, num_blocks, mpi_comm_rows, &
                                            mpi_comm_cols, useGPU, useQRActual)
694
#endif
695

696
697
698
699
700
701
702
703
704
705
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
     time_evp_back = ttt1-ttts

     deallocate(tmat, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating tmat"//errorMessage
       stop
     endif
706

707
#ifdef HAVE_DETAILED_TIMINGS
708
     call timer%stop("solve_evp_real_2stage_double")
709
#endif
710
1    format(a,f10.3)
711

712
713
714
715
716
#ifdef DOUBLE_PRECISION_REAL
   end function solve_evp_real_2stage_double
#else
   end function solve_evp_real_2stage_single
#endif
717

718
719
720
721
#ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
!-------------------------------------------------------------------------------
!>  \brief solve_evp_real_2stage_single: Fortran function to solve the single-precision real eigenvalue problem with a 2 stage approach
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
755
756
!>  \param use_qr (optional)                    use QR decomposition
!>
757
!>  \result success                             logical, false if error occured
758
!-------------------------------------------------------------------------------
759
760
761
762
763
764
765
766
767
768
769
770
771
772

#ifdef DOUBLE_PRECISION_REAL
  function solve_evp_real_2stage_double(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)
#else
  function solve_evp_real_2stage_single(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)
#endif
773

774
#ifdef HAVE_DETAILED_TIMINGS
775
    use timings
776
#endif
777

778
779
   use cuda_functions
   use mod_check_for_gpu
780
   use iso_c_binding
781
782
783
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
784
   implicit none
785
786
787
788
789
790
791
792
793
   logical, intent(in), optional             :: useQR
   logical                                   :: useQRActual, useQREnvironment
   integer(kind=c_int), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer(kind=c_int)                       :: THIS_REAL_ELPA_KERNEL

   integer(kind=c_int), intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                                mpi_comm_cols, mpi_comm_all
   integer(kind=c_int), intent(in)           :: nblk
   real(kind=c_float), intent(inout)         :: ev(na)
794
#ifdef USE_ASSUMED_SIZE
795
   real(kind=c_float), intent(inout)         :: a(lda,*),  q(ldq,*)
796
797

#else
798
   real(kind=c_float), intent(inout)         :: a(lda,matrixCols),  q(ldq,matrixCols)
799
#endif
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
   real(kind=c_float), allocatable           :: hh_trans_real(:,:)

   integer(kind=c_int)                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer(kind=c_int)                       :: nbw, num_blocks
   real(kind=c_float), allocatable           :: tmat(:,:,:), e(:)
   integer(kind=c_intptr_t)                  :: tmat_dev, q_dev, a_dev
   real(kind=c_double)                       :: ttt0, ttt1, ttts  ! MPI_WTIME always needs double
   integer(kind=c_int)                       :: i
   logical                                   :: success
   logical, save                             :: firstCall = .true.
   logical                                   :: wantDebug
   integer(kind=c_int)                       :: istat
   character(200)                            :: errorMessage
   logical                                   :: useGPU
   integer(kind=c_int)                       :: numberOfGPUDevices
Andreas Marek's avatar
Andreas Marek committed
815

816
#ifdef HAVE_DETAILED_TIMINGS
817
    call timer%start("solve_evp_real_2stage_single")
818
#endif
819
820
821
#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("mpi_communication")
#endif
822
823
824
825
826
827
828
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
829
830
831
#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("mpi_communication")
#endif
832
833
834
835
836
837
    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif
838

839
    success = .true.
840

841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
    useQRActual = .false.
    useGPU      = .false.

    ! set usage of qr decomposition via API call
    if (present(useQR)) then
      if (useQR) useQRActual = .true.
        if (.not.(useQR)) useQRACtual = .false.
    endif

    ! overwrite this with environment variable settings
    if (qr_decomposition_via_environment_variable(useQREnvironment)) then
      useQRActual = useQREnvironment
    endif

    if (useQRActual) then
856
      if (mod(na,2) .ne. 0) then
857
858
859
860
861
862
863
864
865
866
        if (wantDebug) then
          write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
        endif
        print *, "Do not use QR-decomposition for this matrix and blocksize."
        success = .false.
        return
      endif
    endif

    if (present(THIS_REAL_ELPA_KERNEL_API)) then
867
      ! user defined kernel via the optional argument in the API call
868
      THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
869
    else
870

871
872
      ! if kernel is not choosen via api
      ! check whether set by environment variable
873
      THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
874
    endif
875

876
    ! check whether choosen kernel is allowed
877
    if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
878
879
880

      if (my_pe == 0) then
        write(error_unit,*) " "
881
        write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
882
883
884
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
885
886
887
        do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
888
889
          endif
        enddo
890

891
        write(error_unit,*) " "
892
893
894
895
896
897
898
899
900
901
902
        ! check whether generic kernel is defined
         if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
           write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
         else
           write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used"
         endif
      endif  ! my_pe == 0
      if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
        THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
      else
        THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
903
904
      endif
    endif
905

906
907
908
    if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
      if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
        useGPU = .true.
909
910
911
      endif
      if (nblk .ne. 128) then
        print *,"At the moment GPU version needs blocksize 128"
912
        error stop
913
      endif
914
    ! some temporarilly checks until single precision works with all kernels
915

916
917
918
919
920
921
922
      ! set the neccessary parameters
      cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
      cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
      cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
      cudaHostRegisterPortable = cuda_hostRegisterPortable()
      cudaHostRegisterMapped   = cuda_hostRegisterMapped()
    endif
923

924
    ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
925
926
927
928
929
930
931
932
933
    ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
    ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
    ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
    ! on this and maybe allow a run-time optimization here
    if (useGPU) then
      nbw = nblk
    else
      nbw = (63/nblk+1)*nblk
    endif
934

935
936
937
938
    num_blocks = (na-1)/nbw + 1

    allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
939
      print *,"solve_evp_real_2stage: error when allocating tmat "//errorMessage
940
941
      stop
    endif
942

943
    ! Reduction full -> band
944

945
946
    ttt0 = MPI_Wtime()
    ttts = ttt0
947
#ifdef DOUBLE_PRECISION_REAL
948
949
    call bandred_real_double(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                      tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
950
#else
951
952
    call bandred_real_single(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                      tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
953
#endif
954
    if (.not.(success)) return
955
956
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
957
       write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0
958

959
     ! Reduction band -> tridiagonal
960

961
962
963
964
965
     allocate(e(na), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when allocating e "//errorMessage
       stop
     endif
966

967
968
969
970
971
972
973
974
     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
     call tridiag_band_real_double(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#else
     call tridiag_band_real_single(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#endif
975

976
977
978
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0
979

980
#ifdef WITH_MPI
981
982
983
#ifdef HAVE_DETAILED_TIMINGS
     call timer%start("mpi_communication")
#endif
984
985
986
#ifdef DOUBLE_PRECISION_REAL
     call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
     call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
987
#else
988
989
     call mpi_bcast(ev,na,MPI_REAL4,0,mpi_comm_all,mpierr)
     call mpi_bcast(e,na,MPI_REAL4,0,mpi_comm_all,mpierr)
990
#endif
991
992
993
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("mpi_communication")
#endif
994
#endif /* WITH_MPI */
995
996
     ttt1 = MPI_Wtime()
     time_evp_fwd = ttt1-ttts
997

998
     ! Solve tridiagonal system
999

1000
1001
1002
1003
1004
1005
1006
1007
1008
     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
     call solve_tridi_double(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                      mpi_comm_cols, wantDebug, success)
#else
     call solve_tridi_single(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                      mpi_comm_cols, wantDebug, success)
#endif
     if (.not.(success)) return
1009

1010
1011
1012
1013
1014
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
     write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
     time_evp_solve = ttt1-ttt0
     ttts = ttt1
1015

1016
1017
1018
1019
1020
1021
     deallocate(e, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating e "//errorMessage
       stop
     endif
     ! Backtransform stage 1
1022

1023
1024
     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
1025
     call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
1026
1027
1028
                                    mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success,      &
                                    THIS_REAL_ELPA_KERNEL)
#else
1029
     call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
1030
1031
1032
                                    mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success,      &
                                    THIS_REAL_ELPA_KERNEL)
#endif
1033

1034
1035
1036
1037
     if (.not.(success)) return
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0
1038

1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
     ! We can now deallocate the stored householder vectors
     deallocate(hh_trans_real, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating hh_trans_real "//errorMessage
       stop
     endif


     ! Backtransform stage 2
     print *,"useGPU== ",useGPU
     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Typo    
Andreas Marek committed
1051
1052
     call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
                                            matrixCols, num_blocks, mpi_comm_rows, &
1053
                                            mpi_comm_cols, useGPU, useQRActual)
1054
#else
Andreas Marek's avatar
Typo    
Andreas Marek committed
1055
1056
     call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
                                            matrixCols, num_blocks, mpi_comm_rows, &
1057
                                            mpi_comm_cols, useGPU, useQRActual)
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
#endif

     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
     time_evp_back = ttt1-ttts

     deallocate(tmat, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating tmat"//errorMessage
       stop
     endif

#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("solve_evp_real_2stage_single")
#endif
1    format(a,f10.3)

#ifdef DOUBLE_PRECISION_REAL
   end function solve_evp_real_2stage_double
#else
   end function solve_evp_real_2stage_single
#endif

#endif /* WANT_SINGLE_PRECISION_REAL */

1084
!>  \brief solve_evp_complex_2stage_double: Fortran function to solve the double-precision complex eigenvalue problem with a 2 stage approach
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \result success                             logical, false if error occured
!-------------------------------------------------------------------------------
#define DOUBLE_PRECISION_COMPLEX 1

#ifdef DOUBLE_PRECISION_COMPLEX
function solve_evp_complex_2stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
#else
function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
#endif


#ifdef HAVE_DETAILED_TIMINGS
   use timings
#endif
1136
1137
1138
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
1139
1140
1141
1142
   use cuda_functions
   use mod_check_for_gpu
   use iso_c_binding
   implicit none
1143
1144
1145
1146
   integer(kind=c_int), intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer(kind=c_int)                       :: THIS_COMPLEX_ELPA_KERNEL
   integer(kind=c_int), intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   real(kind=c_double), intent(inout)        :: ev(na)
1147
#ifdef USE_ASSUMED_SIZE
1148
   complex(kind=c_double), intent(inout)     :: a(lda,*), q(ldq,*)
1149
#else
1150
   complex(kind=c_double), intent(inout)     :: a(lda,matrixCols), q(ldq,matrixCols)
1151
#endif
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
   complex(kind=c_double), allocatable       :: hh_trans_complex(:,:)

   integer(kind=c_int)                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer(kind=c_int)                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex(kind=c_double), allocatable       :: tmat(:,:,:)
   real(kind=c_double), allocatable          :: q_real(:,:), e(:)
   real(kind=c_double)                       :: ttt0, ttt1, ttts  ! MPI_WTIME always needs double
   integer(kind=c_int)                       :: i

   logical                                   :: success, wantDebug
   logical, save                             :: firstCall = .true.
   integer(kind=c_int)                       :: istat
   character(200)                            :: errorMessage
   logical                                   :: useGPU
   integer(kind=c_int)                       :: numberOfGPUDevices
1167
1168
1169
1170

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_complex_2stage_double")
#endif
1171
1172
1173
#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("mpi_communication")
#endif
1174
1175
1176
1177
1178
1179
1180
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
1181
1182
1183
#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("mpi_communication")
#endif
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
    useGPU = .false.
    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif


    success = .true.

    if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
    else
      ! if kernel is not choosen via api
      ! check whether set by environment variable
      THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
    endif

    ! check whether choosen kernel is allowed
    if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then

      if (my_pe == 0) then
        write(error_unit,*) " "
        write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
        do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
          endif
        enddo

        write(error_unit,*) " "
        write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
      endif
      THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
    endif
1224

1225
1226
1227
1228
1229
1230
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
      if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
        useGPU=.true.
      endif
      if (nblk .ne. 128) then
        print *,"At the moment GPU version needs blocksize 128"
1231
        error stop
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
      endif

      ! set the neccessary parameters
      cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
      cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
      cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
      cudaHostRegisterPortable = cuda_hostRegisterPortable()
      cudaHostRegisterMapped   = cuda_hostRegisterMapped()
    endif

    ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

    nbw = (31/nblk+1)*nblk

    num_blocks = (na-1)/nbw + 1

    allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when allocating tmat"//errorMessage
      stop
    endif
    ! Reduction full -> band

    ttt0 = MPI_Wtime()
    ttts = ttt0
#ifdef DOUBLE_PRECISION_COMPLEX
    call bandred_complex_double(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                         tmat, wantDebug, useGPU, success)
#else
    call bandred_complex_single(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                         tmat, wantDebug, useGPU, success)
#endif
    if (.not.(success)) then

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_complex_2stage_double")
#endif
      return
    endif
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0

    ! Reduction band -> tridiagonal

    allocate(e(na), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when allocating e"//errorMessage
      stop
    endif


    ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
   call tridiag_band_complex_double(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, &
                             mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#else
   call tridiag_band_complex_single(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, &
                             mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#endif

    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0

#ifdef WITH_MPI
1298
1299
1300
#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("mpi_communication")
#endif
1301
1302
1303
1304
1305
1306