!   This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation, which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"
!> \brief Fortran module which provides the routines to use the two-stage ELPA solver
module ELPA2

! Version 1.1.2, 2011-02-21

  use elpa_utilities
  use elpa1_compute
  use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve
  use elpa2_utilities
  use elpa2_compute
  use elpa_pdgeqrf

  use iso_c_binding

#ifdef WITH_GPU_VERSION
!  use cuda_routines
!  use cuda_c_kernel
!  use iso_c_binding
#endif
  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage
  include 'mpif.h'

!******
contains
!-------------------------------------------------------------------------------
!>  \brief solve_evp_real_2stage: Fortran function to solve the real eigenvalue problem with a 2 stage approach
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in ScaLAPACK.
!>                                              The full matrix must be set (not only one half as in ScaLAPACK).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in ScaLAPACK.
!>                                              Must always be dimensioned to the full size (corresponding to (na,na)),
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify the ELPA2 kernel to be used via the API
!>
!>  \param useQR (optional)                     use QR decomposition
!>
!>  \result success                             logical, false if an error occurred
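!>
!>  \par Usage sketch
!>  A minimal, illustrative call sequence added here for orientation (it is
!>  not part of the original interface documentation). It assumes that the
!>  block-cyclic distribution and the three communicators have been set up
!>  beforehand, e.g. with the communicator setup routine of elpa1:
!>  \code
!>  logical :: success
!>  success = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!>                                  matrixCols, mpi_comm_rows,         &
!>                                  mpi_comm_cols, mpi_comm_all)
!>  if (.not. success) print *, "solve_evp_real_2stage failed"
!>  \endcode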
!-------------------------------------------------------------------------------

  function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                                 matrixCols,                               &
                                 mpi_comm_rows, mpi_comm_cols,             &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,  &
                                 useQR) result(success)

#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif

   use precision
   use cuda_functions
   use mod_check_for_gpu
   use iso_c_binding
   implicit none
   logical, intent(in), optional          :: useQR
   logical                                :: useQRActual, useQREnvironment
   integer(kind=ik), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer(kind=ik)                       :: THIS_REAL_ELPA_KERNEL

   integer(kind=ik), intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                             mpi_comm_cols, mpi_comm_all
   integer(kind=ik), intent(in)           :: nblk
   real(kind=rk), intent(inout)           :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)
   ! was
   ! real a(lda,*), q(ldq,*)
   real(kind=rk), allocatable             :: hh_trans_real(:,:)

   integer(kind=ik)                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer(kind=ik)                       :: nbw, num_blocks
   real(kind=rk), allocatable             :: tmat(:,:,:), e(:)
   real(kind=c_double)                    :: ttt0, ttt1, ttts  ! MPI_WTIME always needs double
   integer(kind=ik)                       :: i
   logical                                :: success
   logical, save                          :: firstCall = .true.
   logical                                :: wantDebug
   integer(kind=ik)                       :: istat
   character(200)                         :: errorMessage
   logical                                :: useGPU
   integer(kind=ik)                       :: numberOfGPUDevices

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_real_2stage")
#endif
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif

    success = .true.

    useQRActual = .false.
    useGPU      = .false.

    ! set usage of QR decomposition via API call
    if (present(useQR)) then
      useQRActual = useQR
    endif

    ! overwrite this with environment variable settings
    if (qr_decomposition_via_environment_variable(useQREnvironment)) then
      useQRActual = useQREnvironment
    endif

    if (useQRActual) then
      if (mod(na,nblk) .ne. 0) then
        if (wantDebug) then
          write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: matrix size is not a multiple of the blocksize"
        endif
        print *, "Do not use QR-decomposition for this matrix and blocksize."
        success = .false.
        return
      endif
    endif

    if (present(THIS_REAL_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
    else

      ! if the kernel is not chosen via the API,
      ! check whether it is set by an environment variable
      THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
    endif

    ! check whether the chosen kernel is allowed
    if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then

      if (my_pe == 0) then
        write(error_unit,*) " "
        write(error_unit,*) "The chosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
        do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
          endif
        enddo

        write(error_unit,*) " "
        write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used!"
      endif
      THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC

    endif

    if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
      if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
        useGPU = .true.
      endif
      if (nblk .ne. 128) then
        print *,"At the moment the GPU version needs a blocksize of 128"
        stop
      endif

      ! set the necessary parameters
      cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
      cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
      cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
      cudaHostRegisterPortable = cuda_hostRegisterPortable()
      cudaHostRegisterMapped   = cuda_hostRegisterMapped()
    endif

    ! Choose bandwidth, must be a multiple of nblk, set to a value >= 64
    ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
    ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
    ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
    ! on this and maybe allow a run-time optimization here
    if (useGPU) then
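      ! for the GPU version the bandwidth is simply set equal to the blocksize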
      nbw = nblk
    else
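      ! on the CPU: the smallest multiple of nblk that is at least 64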
      nbw = (63/nblk+1)*nblk
    endif

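    ! number of block columns of width nbw, i.e. ceiling(na/nbw)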
    num_blocks = (na-1)/nbw + 1

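    ! tmat stores the blocked Householder factors produced during the
    ! reduction to band form; they are reused in the back transformation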
    allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_real_2stage: error when allocating tmat "//errorMessage
      stop
    endif

    ! Reduction full -> band

    ttt0 = MPI_Wtime()
    ttts = ttt0
    call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                      tmat, wantDebug, useGPU, success, useQRActual)

    if (.not.(success)) return
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

     ! Reduction band -> tridiagonal

     allocate(e(na), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when allocating e "//errorMessage
       stop
     endif

     ttt0 = MPI_Wtime()
     call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                            mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

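     ! broadcast the diagonal (ev) and subdiagonal (e) of the tridiagonal
     ! matrix from rank 0 so that all tasks work on identical data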
#ifdef DOUBLE_PRECISION_REAL
     call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
     call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
#else
     call mpi_bcast(ev,na,MPI_REAL4,0,mpi_comm_all,mpierr)
     call mpi_bcast(e,na,MPI_REAL4,0,mpi_comm_all,mpierr)
#endif

     ttt1 = MPI_Wtime()
     time_evp_fwd = ttt1-ttts

     ! Solve tridiagonal system

     ttt0 = MPI_Wtime()
     call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                      mpi_comm_cols, wantDebug, success)
     if (.not.(success)) return

     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
     time_evp_solve = ttt1-ttt0
     ttts = ttt1

     deallocate(e, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating e "//errorMessage
       stop
     endif

     ! Backtransform stage 1

     ttt0 = MPI_Wtime()
     call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
                                      mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
                                      THIS_REAL_ELPA_KERNEL)
     if (.not.(success)) return
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

     ! We can now deallocate the stored Householder vectors
     deallocate(hh_trans_real, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating hh_trans_real "//errorMessage
       stop
     endif

     ! Backtransform stage 2
     if (wantDebug) print *,"useGPU == ",useGPU
     ttt0 = MPI_Wtime()
     call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                     mpi_comm_cols, useGPU, useQRActual)
     ttt1 = MPI_Wtime()
     if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
     time_evp_back = ttt1-ttts

     deallocate(tmat, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_real_2stage: error when deallocating tmat"//errorMessage
       stop
     endif

#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("solve_evp_real_2stage")
#endif
1    format(a,f10.3)

   end function solve_evp_real_2stage

!>  \brief solve_evp_complex_2stage: Fortran function to solve the complex eigenvalue problem with a 2 stage approach
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in ScaLAPACK.
!>                                              The full matrix must be set (not only one half as in ScaLAPACK).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in ScaLAPACK.
!>                                              Must always be dimensioned to the full size (corresponding to (na,na)),
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify the ELPA2 kernel to be used via the API
!>
!>  \result success                             logical, false if an error occurred
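!>
!>  \par Usage sketch
!>  A minimal, illustrative call sequence added here for orientation (it is
!>  not part of the original interface documentation); the same setup as for
!>  solve_evp_real_2stage is assumed, except that a and q are complex arrays:
!>  \code
!>  logical :: success
!>  success = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!>                                     matrixCols, mpi_comm_rows,         &
!>                                     mpi_comm_cols, mpi_comm_all)
!>  if (.not. success) print *, "solve_evp_complex_2stage failed"
!>  \endcode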
!-------------------------------------------------------------------------------
function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk,           &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,    &
                                  mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)

#ifdef HAVE_DETAILED_TIMINGS
   use timings
#endif
   use precision
   use cuda_functions
   use mod_check_for_gpu
   use iso_c_binding
   implicit none
   integer(kind=ik), intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer(kind=ik)                       :: THIS_COMPLEX_ELPA_KERNEL
   integer(kind=ik), intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex(kind=ck), intent(inout)        :: a(lda,matrixCols), q(ldq,matrixCols)
   ! was
   ! complex a(lda,*), q(ldq,*)
   real(kind=rk), intent(inout)           :: ev(na)
   complex(kind=ck), allocatable          :: hh_trans_complex(:,:)

   integer(kind=ik)                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer(kind=ik)                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex(kind=ck), allocatable          :: tmat(:,:,:)
   real(kind=rk), allocatable             :: q_real(:,:), e(:)
   real(kind=c_double)                    :: ttt0, ttt1, ttts  ! MPI_WTIME always needs double
   integer(kind=ik)                       :: i

   logical                                :: success, wantDebug
   logical, save                          :: firstCall = .true.
   integer(kind=ik)                       :: istat
   character(200)                         :: errorMessage
   logical                                :: useGPU
   integer(kind=ik)                       :: numberOfGPUDevices

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_complex_2stage")
#endif
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

    useGPU = .false.
    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif


    success = .true.

    if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
    else
      ! if the kernel is not chosen via the API,
      ! check whether it is set by an environment variable
      THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
    endif

    ! check whether the chosen kernel is allowed
    if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then

      if (my_pe == 0) then
        write(error_unit,*) " "
        write(error_unit,*) "The chosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
        do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
          endif
        enddo

        write(error_unit,*) " "
        write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used!"
      endif
      THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
    endif

    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
      if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
        useGPU=.true.
      endif
      if (nblk .ne. 128) then
        print *,"At the moment the GPU version needs a blocksize of 128"
        stop
      endif

      ! set the necessary parameters
      cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
      cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
      cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
      cudaHostRegisterPortable = cuda_hostRegisterPortable()
      cudaHostRegisterMapped   = cuda_hostRegisterMapped()
    endif

    ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

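    ! the smallest multiple of nblk that is at least 32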
    nbw = (31/nblk+1)*nblk

    num_blocks = (na-1)/nbw + 1

    allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when allocating tmat"//errorMessage
      stop
    endif

    ! Reduction full -> band

    ttt0 = MPI_Wtime()
    ttts = ttt0
    call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                         tmat, wantDebug, useGPU, success)
    if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop()
#endif
      return
    endif
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0

    ! Reduction band -> tridiagonal

    allocate(e(na), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when allocating e"//errorMessage
      stop
    endif


    ttt0 = MPI_Wtime()
    call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, &
                              mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0

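    ! broadcast the diagonal (ev) and subdiagonal (e) of the tridiagonal
    ! matrix from rank 0 so that all tasks work on identical data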
#ifdef DOUBLE_PRECISION_COMPLEX
    call mpi_bcast(ev, na, mpi_real8, 0, mpi_comm_all, mpierr)
    call mpi_bcast(e, na, mpi_real8, 0, mpi_comm_all, mpierr)
#else
    call mpi_bcast(ev, na, mpi_real4, 0, mpi_comm_all, mpierr)
    call mpi_bcast(e, na, mpi_real4, 0, mpi_comm_all, mpierr)
#endif
    ttt1 = MPI_Wtime()
    time_evp_fwd = ttt1-ttts

    l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
    l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
    l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

    allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when allocating q_real"//errorMessage
      stop
    endif

    ! Solve tridiagonal system

    ttt0 = MPI_Wtime()
    call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
                     mpi_comm_rows, mpi_comm_cols, wantDebug, success)
    if (.not.(success)) return

    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
       write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
    time_evp_solve = ttt1-ttt0
    ttts = ttt1

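    ! copy the real eigenvectors computed by solve_tridi into the complex output array q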
    q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

    deallocate(e, q_real, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when deallocating e, q_real"//errorMessage
      stop
    endif


    ! Backtransform stage 1

    ttt0 = MPI_Wtime()
    call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,   &
                                        matrixCols, hh_trans_complex, &
                                        mpi_comm_rows, mpi_comm_cols, &
                                        wantDebug, useGPU, success,   &
                                        THIS_COMPLEX_ELPA_KERNEL)
    if (.not.(success)) return
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0

    ! We can now deallocate the stored Householder vectors
    deallocate(hh_trans_complex, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when deallocating hh_trans_complex"//errorMessage
      stop
    endif

    ! Backtransform stage 2

    ttt0 = MPI_Wtime()
    call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, &
                                       mpi_comm_rows, mpi_comm_cols, useGPU)
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
    time_evp_back = ttt1-ttts

    deallocate(tmat, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when deallocating tmat "//errorMessage
      stop
    endif

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("solve_evp_complex_2stage")
#endif

1   format(a,f10.3)

end function solve_evp_complex_2stage

end module ELPA2