!   This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"
!> \brief Fortran module which provides the routines to use the 2-stage ELPA solver
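!>
!> The 2-stage solver proceeds in five steps, as implemented in the driver
!> routines below:
!>   1. reduction of the full matrix to band form (bandred_real_*)
!>   2. reduction of the band matrix to tridiagonal form (tridiag_band_real_*)
!>   3. solution of the tridiagonal eigenproblem (solve_tridi_*)
!>   4. backtransformation of the eigenvectors to band form (trans_ev_tridi_to_band_real_*)
!>   5. backtransformation to the eigenvectors of the full matrix (trans_ev_band_to_full_real_*)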
module ELPA2

! Version 1.1.2, 2011-02-21

  use elpa_utilities
  use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve
  use elpa2_utilities

  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage_double               !< old, deprecated interface: Driver routine for real double-precision eigenvalue problem; will be deleted at some point
  public :: solve_evp_complex_2stage_double            !< old, deprecated interface: Driver routine for complex double-precision eigenvalue problem; will be deleted at some point
  public :: elpa_solve_evp_real_2stage_double          !< Driver routine for real double-precision 2-stage eigenvalue problem
  public :: elpa_solve_evp_complex_2stage_double       !< Driver routine for complex double-precision 2-stage eigenvalue problem
!-------------------------------------------------------------------------------
!>  \brief solve_evp_real_2stage: Old, deprecated interface for elpa_solve_evp_real_2stage_double
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param useQR (optional)                     use QR decomposition
!>  \param useGPU (optional)                    decide whether to use GPUs or not
!>
!>  \result success                             logical, false if error occurred
!-------------------------------------------------------------------------------
  interface solve_evp_real_2stage
    module procedure solve_evp_real_2stage_double
  end interface

!-------------------------------------------------------------------------------
!>  \brief elpa_solve_evp_real_2stage_double: Fortran function to solve the real double-precision eigenvalue problem with a 2 stage approach. This is called by "elpa_solve_evp_real_double"
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param useQR (optional)                     use QR decomposition
!>  \param useGPU (optional)                    decide whether to use GPUs or not
!>
!>  \result success                             logical, false if error occurred
!-------------------------------------------------------------------------------
  interface elpa_solve_evp_real_2stage_double
    module procedure solve_evp_real_2stage_double
  end interface
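
!-------------------------------------------------------------------------------
!>  Illustrative call of the driver above (a sketch only: the communicator
!>  setup and the block-cyclic distribution of a and q are assumed to follow
!>  the usual ELPA conventions and are not shown here):
!>
!>  \code{.f90}
!>    success = elpa_solve_evp_real_2stage_double(na, nev, a, lda, ev, q, ldq, &
!>                                                nblk, matrixCols,            &
!>                                                mpi_comm_rows, mpi_comm_cols,&
!>                                                mpi_comm_all)
!>    if (.not. success) write(error_unit,*) "ELPA 2-stage solver failed"
!>  \endcode
!-------------------------------------------------------------------------------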

!-------------------------------------------------------------------------------
!>  \brief solve_evp_complex_2stage: Old, deprecated interface for elpa_solve_evp_complex_2stage_double
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param useGPU (optional)                    decide whether to use GPUs or not
!>
!>  \result success                             logical, false if error occurred
!-------------------------------------------------------------------------------
  interface solve_evp_complex_2stage
    module procedure solve_evp_complex_2stage_double
  end interface

!-------------------------------------------------------------------------------
!>  \brief elpa_solve_evp_complex_2stage_double: Fortran function to solve the complex double-precision eigenvalue problem with a 2 stage approach. This is called by "elpa_solve_evp_complex_double"
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param useGPU (optional)                    decide whether to use GPUs or not
!>
!>  \result success                             logical, false if error occurred
!-------------------------------------------------------------------------------
  interface elpa_solve_evp_complex_2stage_double
    module procedure solve_evp_complex_2stage_double
  end interface
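
!-------------------------------------------------------------------------------
!>  Illustrative call (a sketch; unlike the real driver there is no useQR
!>  argument, and ev is real-valued even though a and q are complex):
!>
!>  \code{.f90}
!>    success = elpa_solve_evp_complex_2stage_double(na, nev, a, lda, ev, q, ldq, &
!>                                                   nblk, matrixCols,            &
!>                                                   mpi_comm_rows, mpi_comm_cols,&
!>                                                   mpi_comm_all)
!>  \endcode
!-------------------------------------------------------------------------------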

#ifdef WANT_SINGLE_PRECISION_REAL
  public :: solve_evp_real_2stage_single
  public :: elpa_solve_evp_real_2stage_single
#endif

#ifdef WANT_SINGLE_PRECISION_COMPLEX
  public :: solve_evp_complex_2stage_single
  public :: elpa_solve_evp_complex_2stage_single
#endif

#ifdef WANT_SINGLE_PRECISION_REAL
!-------------------------------------------------------------------------------
!>  \brief elpa_solve_evp_real_2stage_single: Fortran function to solve the real single-precision eigenvalue problem with a 2 stage approach. This is called by "elpa_solve_evp_real_single"
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param useQR (optional)                     use QR decomposition
!>  \param useGPU (optional)                    decide whether to use GPUs or not
!>
!>  \result success                             logical, false if error occurred
!-------------------------------------------------------------------------------
  interface elpa_solve_evp_real_2stage_single
    module procedure solve_evp_real_2stage_single
  end interface
#endif

#ifdef WANT_SINGLE_PRECISION_COMPLEX
!-------------------------------------------------------------------------------
!>  \brief elpa_solve_evp_complex_2stage_single: Fortran function to solve the complex single-precision eigenvalue problem with a 2 stage approach. This is called by "elpa_solve_evp_complex_single"
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param useGPU (optional)                    decide whether to use GPUs or not
!>
!>  \result success                             logical, false if error occurred
!-------------------------------------------------------------------------------
  interface elpa_solve_evp_complex_2stage_single
    module procedure solve_evp_complex_2stage_single
  end interface
#endif

  contains
!-------------------------------------------------------------------------------
!>  \brief solve_evp_real_2stage_double: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param useQR (optional)                     use QR decomposition
!>  \param useGPU (optional)                    decide whether to use GPUs or not
!>
!>  \result success                             logical, false if error occurred
!-------------------------------------------------------------------------------

! #undef PRECISION
! #undef MATH_DATATYPE
! #define MATH_DATATYPE real
! #define PRECISION double
! #include "elpa2_template.X90"

#define DOUBLE_PRECISION_REAL

#ifdef DOUBLE_PRECISION_REAL
  function solve_evp_real_2stage_double(na, nev, a, lda, ev, q, ldq, nblk,        &
                                 matrixCols,                              &
                                 mpi_comm_rows, mpi_comm_cols,            &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API, &
                                 useQR, useGPU) result(success)
#else
  function solve_evp_real_2stage_single(na, nev, a, lda, ev, q, ldq, nblk,        &
                                 matrixCols,                              &
                                 mpi_comm_rows, mpi_comm_cols,            &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API, &
                                 useQR, useGPU) result(success)
#endif

#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
   use elpa1_utilities, only : gpu_usage_via_environment_variable
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
   use cuda_functions
   use mod_check_for_gpu
   use iso_c_binding
   implicit none
   logical, intent(in), optional             :: useQR, useGPU
   logical                                   :: useQRActual, useQREnvironment
   integer(kind=c_int), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer(kind=c_int)                       :: THIS_REAL_ELPA_KERNEL

   integer(kind=c_int), intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                                mpi_comm_cols, mpi_comm_all
   integer(kind=c_int), intent(in)           :: nblk
   real(kind=c_double), intent(inout)        :: ev(na)
#ifdef USE_ASSUMED_SIZE
   real(kind=c_double), intent(inout)        :: a(lda,*), q(ldq,*)
#else
   real(kind=c_double), intent(inout)        :: a(lda,matrixCols), q(ldq,matrixCols)
#endif
   real(kind=c_double), allocatable          :: hh_trans_real(:,:)

   integer(kind=c_int)                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer(kind=c_int)                       :: nbw, num_blocks
   real(kind=c_double), allocatable          :: tmat(:,:,:), e(:)
   integer(kind=c_intptr_t)                  :: tmat_dev, q_dev, a_dev
   real(kind=c_double)                       :: ttt0, ttt1, ttts  ! MPI_WTIME always needs double
   integer(kind=c_int)                       :: i
   logical                                   :: success
   logical, save                             :: firstCall = .true.
   logical                                   :: wantDebug
   integer(kind=c_int)                       :: istat
   character(200)                            :: errorMessage
   logical                                   :: do_useGPU
   integer(kind=c_int)                       :: numberOfGPUDevices

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_real_2stage_double")
#endif

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("mpi_communication")
#endif
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("mpi_communication")
#endif

    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif

    success = .true.

    useQRActual = .false.
    do_useGPU   = .false.

    ! set usage of qr decomposition via API call
    if (present(useQR)) then
      useQRActual = useQR
    endif

    ! overwrite this with environment variable settings
    if (qr_decomposition_via_environment_variable(useQREnvironment)) then
      useQRActual = useQREnvironment
    endif

    if (useQRActual) then
      if (mod(na,2) .ne. 0) then
        if (wantDebug) then
          write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
        endif
        print *, "Do not use QR-decomposition for this matrix and blocksize."
        success = .false.
        return
      endif
    endif

    if (present(useGPU)) then
      if (useGPU) then
        if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then

           do_useGPU = .true.
           ! set the necessary parameters
           cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
           cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
           cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
           cudaHostRegisterPortable = cuda_hostRegisterPortable()
           cudaHostRegisterMapped   = cuda_hostRegisterMapped()
        else
          write(error_unit,*) "GPUs are requested but not detected! Aborting..."
          success = .false.
          return
        endif
      endif
    else
      ! check whether set by environment variable
      do_useGPU = gpu_usage_via_environment_variable()
    endif

    if (present(THIS_REAL_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
    else
      ! if kernel is not chosen via api
      ! check whether set by environment variable
      THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
    endif

    ! check whether chosen kernel is allowed: function returns true if NOT allowed! change this
    if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then

      if (my_pe == 0) then
        write(error_unit,*) " "
        write(error_unit,*) "The chosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
        do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
          endif
        enddo

        write(error_unit,*) " "
        ! check whether generic kernel is defined
        if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
          write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
        else
          write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used"
        endif
      endif  ! my_pe == 0
      if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
        THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
      else
        THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
      endif
    endif

    ! check consistency between request for GPUs and defined kernel
    if (do_useGPU) then
      if (THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GPU) then
        write(error_unit,*) "GPU usage has been requested but compute kernel is defined as non-GPU! Aborting..."
        success = .false.
        return
      endif
    endif

    if (do_useGPU) then
      if (nblk .ne. 128) then
        write(error_unit,*) "In case of GPU usage the blocksize for ELPA 2stage has to be 128"
        success = .false.
        return
      endif
    endif

    ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
    ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
    ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
    ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
    ! on this and maybe allow a run-time optimization here
    if (do_useGPU) then
      nbw = nblk
    else
      nbw = (63/nblk+1)*nblk
    endif
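    ! Note: with Fortran integer division, (63/nblk+1)*nblk is the smallest
    ! multiple of nblk greater than 63, e.g. nblk=16 -> nbw=64, nblk=48 -> nbw=96.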

    num_blocks = (na-1)/nbw + 1
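    ! (na-1)/nbw + 1 is the integer ceiling of na/nbw, i.e. the number of
    ! block columns of width nbw needed to cover the matrix.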

    allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_real_2stage: error when allocating tmat "//errorMessage
      stop
    endif

    ! Reduction full -> band

    ttt0 = MPI_Wtime()
    ttts = ttt0
#ifdef DOUBLE_PRECISION_REAL
    call bandred_real_double(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
        tmat, tmat_dev, wantDebug, do_useGPU, success, useQRActual)
#else
    call bandred_real_single(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
        tmat, tmat_dev, wantDebug, do_useGPU, success, useQRActual)
#endif
    if (.not.(success)) return
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

     ! Reduction band -> tridiagonal

     allocate(e(na), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when allocating e "//errorMessage
       stop
     endif

     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
     call tridiag_band_real_double(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#else
     call tridiag_band_real_single(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#endif

     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
     call timer%start("mpi_communication")
#endif
#ifdef DOUBLE_PRECISION_REAL
     call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
     call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
#else
     call mpi_bcast(ev,na,MPI_REAL4,0,mpi_comm_all,mpierr)
     call mpi_bcast(e,na,MPI_REAL4,0,mpi_comm_all,mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
     ttt1 = MPI_Wtime()
     time_evp_fwd = ttt1-ttts

     ! Solve tridiagonal system

     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
     call solve_tridi_double(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                      mpi_comm_cols, wantDebug, success)
#else
     call solve_tridi_single(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                      mpi_comm_cols, wantDebug, success)
#endif
     if (.not.(success)) return

     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
     time_evp_solve = ttt1-ttt0
     ttts = ttt1

     deallocate(e, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating e "//errorMessage
       stop
     endif

     ! Backtransform stage 1

     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
     call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
                                             mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU, success,  &
                                             THIS_REAL_ELPA_KERNEL)
#else
     call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
                                             mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU, success,  &
                                             THIS_REAL_ELPA_KERNEL)
#endif
     if (.not.(success)) return
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

     ! We can now deallocate the stored householder vectors
     deallocate(hh_trans_real, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating hh_trans_real "//errorMessage
       stop
     endif

     ! Backtransform stage 2
     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
     call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
                                            matrixCols, num_blocks, mpi_comm_rows, &
                                            mpi_comm_cols, do_useGPU, useQRActual)
#else
     call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
                                            matrixCols, num_blocks, mpi_comm_rows, &
                                            mpi_comm_cols, do_useGPU, useQRActual)
#endif

     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
     time_evp_back = ttt1-ttts

     deallocate(tmat, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating tmat"//errorMessage
       stop
     endif

#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("solve_evp_real_2stage_double")
#endif
1    format(a,f10.3)

#ifdef DOUBLE_PRECISION_REAL
   end function solve_evp_real_2stage_double
#else
   end function solve_evp_real_2stage_single
#endif

#ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
!-------------------------------------------------------------------------------
!>  \brief solve_evp_real_2stage_single: Fortran function to solve the single-precision real eigenvalue problem with a 2 stage approach
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param useQR (optional)                     use QR decomposition
!>  \param useGPU (optional)                    decide whether GPUs should be used or not
!>
!>  \result success                             logical, false if error occurred
!-------------------------------------------------------------------------------

#ifdef DOUBLE_PRECISION_REAL
  function solve_evp_real_2stage_double(na, nev, a, lda, ev, q, ldq, nblk,        &
                                 matrixCols,                              &
                                 mpi_comm_rows, mpi_comm_cols,            &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API, &
                                 useQR, useGPU) result(success)
#else
  function solve_evp_real_2stage_single(na, nev, a, lda, ev, q, ldq, nblk,        &
                                 matrixCols,                              &
                                 mpi_comm_rows, mpi_comm_cols,            &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API, &
                                 useQR, useGPU) result(success)
#endif

#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
   use elpa1_utilities, only : gpu_usage_via_environment_variable
   use cuda_functions
   use mod_check_for_gpu
   use iso_c_binding
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
   implicit none
   logical, intent(in), optional             :: useQR, useGPU
   logical                                   :: useQRActual, useQREnvironment
   integer(kind=c_int), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer(kind=c_int)                       :: THIS_REAL_ELPA_KERNEL

   integer(kind=c_int), intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                                mpi_comm_cols, mpi_comm_all
   integer(kind=c_int), intent(in)           :: nblk
   real(kind=c_float), intent(inout)         :: ev(na)
#ifdef USE_ASSUMED_SIZE
   real(kind=c_float), intent(inout)         :: a(lda,*),  q(ldq,*)
#else
   real(kind=c_float), intent(inout)         :: a(lda,matrixCols),  q(ldq,matrixCols)
#endif
   real(kind=c_float), allocatable           :: hh_trans_real(:,:)

   integer(kind=c_int)                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer(kind=c_int)                       :: nbw, num_blocks
   real(kind=c_float), allocatable           :: tmat(:,:,:), e(:)
   integer(kind=c_intptr_t)                  :: tmat_dev, q_dev, a_dev
   real(kind=c_double)                       :: ttt0, ttt1, ttts  ! MPI_WTIME always needs double
   integer(kind=c_int)                       :: i
   logical                                   :: success
   logical, save                             :: firstCall = .true.
   logical                                   :: wantDebug
   integer(kind=c_int)                       :: istat
   character(200)                            :: errorMessage
   logical                                   :: do_useGPU
   integer(kind=c_int)                       :: numberOfGPUDevices

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_real_2stage_single")
#endif

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("mpi_communication")
#endif
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("mpi_communication")
#endif

    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif

    success = .true.

    useQRActual = .false.
    do_useGPU   = .false.

    ! set usage of qr decomposition via API call
    if (present(useQR)) then
      useQRActual = useQR
    endif

    ! overwrite this with environment variable settings
    if (qr_decomposition_via_environment_variable(useQREnvironment)) then
      useQRActual = useQREnvironment
    endif

    if (useQRActual) then
      if (mod(na,2) .ne. 0) then
        if (wantDebug) then
          write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
        endif
        print *, "Do not use QR-decomposition for this matrix and blocksize."
        success = .false.
        return
      endif
    endif

    if (present(useGPU)) then
      if (useGPU) then
        if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then

           do_useGPU = .true.
           ! set the necessary parameters
           cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
           cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
           cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
           cudaHostRegisterPortable = cuda_hostRegisterPortable()
           cudaHostRegisterMapped   = cuda_hostRegisterMapped()
        else
          write(error_unit,*) "GPUs are requested but not detected! Aborting..."
          success = .false.
          return
        endif
      endif
    else
      ! check whether set by environment variable
      do_useGPU = gpu_usage_via_environment_variable()
    endif


    if (present(THIS_REAL_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
    else
      ! if kernel is not chosen via api
      ! check whether set by environment variable
      THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
    endif

    ! check whether chosen kernel is allowed
    if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then

      if (my_pe == 0) then
        write(error_unit,*) " "
        write(error_unit,*) "The chosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
        do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
          endif
        enddo

        write(error_unit,*) " "
        ! check whether generic kernel is defined
        if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
          write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
        else
          write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used"
        endif
      endif  ! my_pe == 0
      if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
        THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
      else
        THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
      endif
    endif

    ! check consistency between request for GPUs and defined kernel
    if (do_useGPU) then
      if (THIS_REAL_ELPA_KERNEL .ne. REAL_ELPA_KERNEL_GPU) then
        write(error_unit,*) "GPU usage has been requested but compute kernel is defined as non-GPU! Aborting..."
        success = .false.
        return
      endif
    endif

    if (do_useGPU) then
      if (nblk .ne. 128) then
        write(error_unit,*) "In case of GPU usage the blocksize for ELPA 2stage has to be 128"
        success = .false.
        return
      endif
    endif

    ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
    ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
    ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
    ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
    ! on this and maybe allow a run-time optimization here
    if (do_useGPU) then
      nbw = nblk
    else
      nbw = (63/nblk+1)*nblk
    endif

    num_blocks = (na-1)/nbw + 1

    allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_real_2stage: error when allocating tmat "//errorMessage
      stop
    endif

    ! Reduction full -> band

    ttt0 = MPI_Wtime()
    ttts = ttt0
#ifdef DOUBLE_PRECISION_REAL
    call bandred_real_double(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
        tmat, tmat_dev, wantDebug, do_useGPU, success, useQRActual)
#else
    call bandred_real_single(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
        tmat, tmat_dev, wantDebug, do_useGPU, success, useQRActual)
#endif
    if (.not.(success)) return
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

     ! Reduction band -> tridiagonal

     allocate(e(na), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when allocating e "//errorMessage
       stop
     endif

     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
     call tridiag_band_real_double(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#else
     call tridiag_band_real_single(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#endif

     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
     call timer%start("mpi_communication")
#endif
#ifdef DOUBLE_PRECISION_REAL
     call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
     call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
#else
     call mpi_bcast(ev,na,MPI_REAL4,0,mpi_comm_all,mpierr)
     call mpi_bcast(e,na,MPI_REAL4,0,mpi_comm_all,mpierr)
#endif
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
     ttt1 = MPI_Wtime()
     time_evp_fwd = ttt1-ttts

     ! Solve tridiagonal system

     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
     call solve_tridi_double(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                      mpi_comm_cols, wantDebug, success)
#else
     call solve_tridi_single(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                      mpi_comm_cols, wantDebug, success)
#endif
     if (.not.(success)) return

     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
     time_evp_solve = ttt1-ttt0
     ttts = ttt1

     deallocate(e, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating e "//errorMessage
       stop
     endif

     ! Backtransform stage 1

     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
     call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
                                             mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU, success,  &
                                             THIS_REAL_ELPA_KERNEL)
#else
     call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
                                             mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU, success,  &
                                             THIS_REAL_ELPA_KERNEL)
#endif
     if (.not.(success)) return
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

     ! We can now deallocate the stored householder vectors
     deallocate(hh_trans_real, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating hh_trans_real "//errorMessage
       stop
     endif

     ! Backtransform stage 2
     ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
     call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
                                            matrixCols, num_blocks, mpi_comm_rows, &
                                            mpi_comm_cols, do_useGPU, useQRActual)
#else
     call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
                                            matrixCols, num_blocks, mpi_comm_rows, &
                                            mpi_comm_cols, do_useGPU, useQRActual)
#endif

     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
     time_evp_back = ttt1-ttts

     deallocate(tmat, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating tmat"//errorMessage
       stop
     endif

#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("solve_evp_real_2stage_single")
#endif
1    format(a,f10.3)

#ifdef DOUBLE_PRECISION_REAL
   end function solve_evp_real_2stage_double
#else
   end function solve_evp_real_2stage_single
#endif

#endif /* WANT_SINGLE_PRECISION_REAL */

!-------------------------------------------------------------------------------
!>  \brief solve_evp_complex_2stage_double: Fortran function to solve the double-precision complex eigenvalue problem with a 2 stage approach
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param useGPU (optional)                    decide whether GPUs should be used or not
!>
!>  \result success                             logical, false if error occurred
!-------------------------------------------------------------------------------
#define DOUBLE_PRECISION_COMPLEX 1

#ifdef DOUBLE_PRECISION_COMPLEX
function solve_evp_complex_2stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API, useGPU) result(success)
#else
function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API, useGPU) result(success)
#endif

#ifdef HAVE_DETAILED_TIMINGS
   use timings
#endif
   use elpa1_utilities, only : gpu_usage_via_environment_variable
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
   use cuda_functions
   use mod_check_for_gpu
   use iso_c_binding
   implicit none
   logical, intent(in), optional             :: useGPU
   integer(kind=c_int), intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer(kind=c_int)                       :: THIS_COMPLEX_ELPA_KERNEL
   integer(kind=c_int), intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   real(kind=c_double), intent(inout)        :: ev(na)
#ifdef USE_ASSUMED_SIZE
   complex(kind=c_double), intent(inout)     :: a(lda,*), q(ldq,*)
#else
   complex(kind=c_double), intent(inout)     :: a(lda,matrixCols), q(ldq,matrixCols)
#endif
   complex(kind=c_double), allocatable       :: hh_trans_complex(:,:)

   integer(kind=c_int)                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer(