!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"

module ELPA2

! Version 1.1.2, 2011-02-21

  use elpa_utilities
  USE ELPA1
  use elpa2_utilities
  use elpa_pdgeqrf

#ifdef WITH_GPU_VERSION
  use cuda_routines
  use cuda_c_kernel
  use iso_c_binding
#endif
  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

  public :: bandred_real
  public :: tridiag_band_real
  public :: trans_ev_tridi_to_band_real
  public :: trans_ev_band_to_full_real

  public :: bandred_complex
  public :: tridiag_band_complex
  public :: trans_ev_tridi_to_band_complex
  public :: trans_ev_band_to_full_complex
  public :: band_band_real
  public :: divide_band

  integer, public :: which_qr_decomposition = 1     ! defines, which QR-decomposition algorithm will be used
                                                    ! 0 for unblocked
                                                    ! 1 for blocked (maxrank: nblk)
!-------------------------------------------------------------------------------

  ! The following array contains the Householder vectors of the
  ! transformation band -> tridiagonal.
  ! It is allocated and set in tridiag_band_real and used in
  ! trans_ev_tridi_to_band_real.
  ! It must be deallocated by the user after trans_ev_tridi_to_band_real!

  real*8, allocatable :: hh_trans_real(:,:)
  complex*16, allocatable :: hh_trans_complex(:,:)

!-------------------------------------------------------------------------------

  include 'mpif.h'


!******
contains

function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_real_2stage: Solves the real eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a and q
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,matrixCols)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
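!
!  A minimal call sketch (illustrative only; the 2D process grid and the
!  block-cyclic distribution of a and q must be set up by the caller, e.g.
!  with get_elpa_row_col_comms from ELPA1 -- all variable names below are
!  placeholders):
!
!    success = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                    matrixCols, mpi_comm_rows,         &
!                                    mpi_comm_cols, mpi_comm_all)
!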
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none

   logical, intent(in), optional :: useQR
   logical                       :: useQRActual, useQREnvironment
   integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer                       :: THIS_REAL_ELPA_KERNEL

   integer, intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                    mpi_comm_cols, mpi_comm_all
   integer, intent(in)           :: nblk
   real*8, intent(inout)         :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)

   integer                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer                       :: nbw, num_blocks
   real*8, allocatable           :: tmat(:,:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
   logical                       :: success
   logical, save                 :: firstCall = .true.
   logical                       :: wantDebug

   integer                       :: istat
   character(200)                :: errorMessage

#ifdef WITH_GPU_VERSION
   if (nblk .ne. 128) then
     print *,"At the moment GPU version needs blocksize 128"
     stop
   endif
#endif

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif

   success = .true.

   useQRActual = .false.

   ! set usage of qr decomposition via API call
   if (present(useQR)) then
     if (useQR) useQRActual = .true.
     if (.not.(useQR)) useQRActual = .false.
   endif

   ! overwrite this with environment variable settings
   if (qr_decomposition_via_environment_variable(useQREnvironment)) then
     useQRActual = useQREnvironment
   endif

   if (useQRActual) then
     if (mod(na,nblk) .ne. 0) then
       if (wantDebug) then
         write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
       endif
       print *, "Do not use QR-decomposition for this matrix and blocksize."
       success = .false.
       return
     endif
   endif

   if (present(THIS_REAL_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
   else

     ! if kernel is not chosen via api
     ! check whether set by environment variable
     THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
   endif

   ! check whether chosen kernel is allowed
   if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
         endif
       enddo
       write(error_unit,*) " "
       write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !"
     endif
     THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC

   endif

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
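   ! (illustration: with the non-GPU formula below, nblk=16 gives nbw=32 and nblk=64 gives nbw=64)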

#ifdef WITH_GPU_VERSION
   nbw = nblk
#else
   nbw = (31/nblk+1)*nblk
#endif

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
   if (istat .ne. 0) then
     print *,"solve_evp_real_2stage: error when allocating tmat "//errorMessage
     stop
   endif

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                     tmat, wantDebug, success, useQRActual)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na), stat=istat, errmsg=errorMessage)
   if (istat .ne. 0) then
     print *,"solve_evp_real_2stage: error when allocating e "//errorMessage
     stop
   endif


   ttt0 = MPI_Wtime()
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, mpi_comm_rows, &
                          mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                    mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
     write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0

   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e, stat=istat, errmsg=errorMessage)
   if (istat .ne. 0) then
     print *,"solve_evp_real_2stage: error when deallocating e "//errorMessage
     stop
   endif
   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, mpi_comm_rows, &
                                    mpi_comm_cols, wantDebug, success, THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real, stat=istat, errmsg=errorMessage)
   if (istat .ne. 0) then
     print *,"solve_evp_real_2stage: error when deallocating hh_trans_real "//errorMessage
     stop
   endif


   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                   mpi_comm_cols, useQRActual)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0

   time_evp_back = ttt1-ttts

   deallocate(tmat, stat=istat, errmsg=errorMessage)
   if (istat .ne. 0) then
     print *,"solve_evp_real_2stage: error when deallocating tmat"//errorMessage
     stop
   endif

#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif
1  format(a,f10.3)

end function solve_evp_real_2stage

!-------------------------------------------------------------------------------

!-------------------------------------------------------------------------------

function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a and q
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,matrixCols)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
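!
!  A minimal call sketch (illustrative only; the distributed matrices and the
!  communicators are set up by the caller, all variable names are placeholders):
!
!    success = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                       matrixCols, mpi_comm_rows,         &
!                                       mpi_comm_cols, mpi_comm_all)
!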
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none

   integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer                       :: THIS_COMPLEX_ELPA_KERNEL
   integer, intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex*16, intent(inout)     :: a(lda,matrixCols), q(ldq,matrixCols)
   real*8, intent(inout)         :: ev(na)

   integer                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex*16, allocatable       :: tmat(:,:,:)
   real*8, allocatable           :: q_real(:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i

   logical                       :: success, wantDebug
   logical, save                 :: firstCall = .true.
   integer                       :: istat
   character(200)                :: errorMessage

#ifdef WITH_GPU_VERSION
   if (nblk .ne. 128) then
     print *,"At the moment GPU version needs blocksize 128"
     stop
   endif
#endif

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif


   success = .true.

   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
   else
     ! if kernel is not chosen via api
     ! check whether set by environment variable
     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
   endif

   ! check whether chosen kernel is allowed
   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then

     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
         endif
       enddo
       write(error_unit,*) " "
       write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
     endif
     THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
   endif
   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
   if (istat .ne. 0) then
     print *,"solve_evp_complex_2stage: error when allocating tmat"//errorMessage
     stop
   endif
   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop()
#endif
     return
   endif
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na), stat=istat, errmsg=errorMessage)
   if (istat .ne. 0) then
     print *,"solve_evp_complex_2stage: error when allocating e"//errorMessage
     stop
   endif


   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

   allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
   if (istat .ne. 0) then
     print *,"solve_evp_complex_2stage: error when allocating q_real"//errorMessage
     stop
   endif

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
                    mpi_comm_rows, mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
      write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real, stat=istat, errmsg=errorMessage)
   if (istat .ne. 0) then
     print *,"solve_evp_complex_2stage: error when deallocating e, q_real"//errorMessage
     stop
   endif


   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,  &
                                       matrixCols, mpi_comm_rows, mpi_comm_cols,&
                                       wantDebug, success,THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex, stat=istat, errmsg=errorMessage)
   if (istat .ne. 0) then
     print *,"solve_evp_complex_2stage: error when deallocating hh_trans_complex"//errorMessage
     stop
   endif



   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, &
                                      mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat, stat=istat, errmsg=errorMessage)
   if (istat .ne. 0) then
     print *,"solve_evp_complex_2stage: error when deallocating tmat "//errorMessage
     stop
   endif

#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif

1  format(a,f10.3)

end function solve_evp_complex_2stage

!-------------------------------------------------------------------------------


subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success, useQR)

!-------------------------------------------------------------------------------
!  bandred_real: Reduces a distributed symmetric matrix to band form
!
!  Parameters
!
!  na          Order of matrix
!
!  a(lda,matrixCols)    Distributed matrix which should be reduced.
!              Distribution is like in Scalapack.
!              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
!              a(:,:) is overwritten on exit with the band and the Householder vectors
!              in the upper half.
!
!  lda         Leading dimension of a
!  matrixCols  local columns of matrix a
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nbw         semi bandwidth of output matrix
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!
!  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
!              Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
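!
!  Typical use (as in solve_evp_real_2stage above, which supplies all of the
!  arguments from its own setup):
!
!    call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
!                      tmat, wantDebug, success, useQRActual)
!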

#ifdef WITH_GPU_VERSION
  use cuda_routines
  use iso_c_binding
#endif

#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif

   implicit none

   integer             :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
   real*8              :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
   integer             :: my_prow, my_pcol, np_rows, np_cols, mpierr
   integer             :: l_cols, l_rows
   integer             :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
   integer             :: istep, ncol, lch, lcx, nlc
   integer             :: tile_size, l_rows_tile, l_cols_tile
#ifdef WITH_GPU_VERSION
   real*8              :: eps
#endif

   real*8              :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)

#ifdef WITH_GPU_VERSION
   real*8, allocatable :: tmp(:), vr(:), vmr(:), umc(:)
#else

   real*8, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)
#endif

   ! needed for blocked QR decomposition
   integer             :: PQRPARAM(11), work_size
   real*8              :: dwork_size(1)
   real*8, allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)

#ifdef WITH_GPU_VERSION
   integer(C_SIZE_T)   :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
   integer, external   :: numroc
   integer             :: ierr
   integer             :: cur_l_rows, cur_l_cols, vmr_size, umc_size
   integer(C_SIZE_T)   :: lc_start, lc_end
   integer             :: lr_end
   integer             :: na_rows, na_cols
#endif
   logical, intent(in) :: wantDebug
   logical, intent(out):: success

   logical, intent(in) :: useQR

   integer             :: istat
   character(200)      :: errorMessage

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("bandred_real")
#endif
   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
   success = .true.

   ! Semibandwidth nbw must be a multiple of blocksize nblk
   if (mod(nbw,nblk)/=0) then
     if (my_prow==0 .and. my_pcol==0) then
       if (wantDebug) then
         write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk
         write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk'
       endif
       success = .false.
       return
     endif
   endif

#ifdef WITH_GPU_VERSION
   na_rows = numroc(na, nblk, my_prow, 0, np_rows)
   na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#endif

   ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

   tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
   tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

   l_rows_tile = tile_size/np_rows ! local rows of a tile
   l_cols_tile = tile_size/np_cols ! local cols of a tile
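   ! (worked example: nblk=32 on a 4x4 process grid gives tile_size = 32*4 = 128, rounded up to
   !  ((128*4-1)/128+1)*128 = 512, i.e. 128 local rows and columns per tile)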

   if (useQR) then
#ifdef WITH_GPU_VERSION
     print *,"qr decomposition at the moment not supported with GPU"
     stop
#else
     if (which_qr_decomposition == 1) then
       call qr_pqrparam_init(pqrparam,    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
       allocate(tauvector(na), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"bandred_real: error when allocating tauvector "//errorMessage
         stop
       endif

       allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"bandred_real: error when allocating blockheuristic "//errorMessage
         stop
       endif

       l_rows = local_index(na, my_prow, np_rows, nblk, -1)
       allocate(vmr(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"bandred_real: error when allocating vmr "//errorMessage
         stop
       endif

       call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector, tmat(1,1,1), nbw, dwork_size(1), -1, na, &
                             nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
       work_size = dwork_size(1)
       allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"bandred_real: error when allocating work_blocked "//errorMessage
         stop
       endif

       work_blocked = 0.0d0
       deallocate(vmr, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"bandred_real: error when deallocating vmr "//errorMessage
         stop
       endif

     endif
#endif
   endif ! useQr

#ifdef WITH_GPU_VERSION
   ! Here we convert the regular host array into a pinned host array
   istat = cuda_malloc(a_dev, lda*na_cols*8_8)
   if (istat .ne. 0) then
     print *,"bandred_real: error in cudaMalloc"
     stop
   endif

   istat = cuda_malloc(tmat_dev, nbw*nbw*8_8)
   if (istat .ne. 0) then
     print *,"bandred_real: error in cudaMalloc"
     stop
   endif

   istat = cuda_malloc(vav_dev, nbw*nbw*8_8)
   if (istat .ne. 0) then
     print *,"bandred_real: error in cudaMalloc"
     stop
   endif

   cur_l_rows = 0
   cur_l_cols = 0

   istat = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)*8_8,cudaMemcpyHostToDevice)
   if (istat .ne. 0) then
     print *,"bandred_real: error in cudaMemcpy"
     stop
   endif

#endif

   do istep = (na-1)/nbw, 1, -1
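     ! Blocked Householder reduction: sweep over the block columns of width nbw,
     ! starting at the trailing block and moving towards the first one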

     n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

     ! Number of local columns/rows of remaining matrix
     l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
     l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
#ifdef WITH_GPU_VERSION
     cur_l_rows = max(l_rows, 1)
     cur_l_cols = max(l_cols, 1)

     vmr_size = cur_l_rows * 2 * n_cols
     umc_size = cur_l_cols * 2 * n_cols

     ! Allocate vmr and umc only if the new size exceeds their current capacity
     ! Added for FORTRAN CALLS
     if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
       if (allocated(vr)) then
         deallocate(vr, stat=istat, errmsg=errorMessage)
         if (istat .ne. 0) then
           print *,"bandred_real: error when deallocating vr "//errorMessage
           stop
         endif
       endif
       allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"bandred_real: error when allocating vr "//errorMessage
         stop
       endif

     endif

     if ((.not. allocated(vmr)) .or. (vmr_size .gt. ubound(vmr, dim=1))) then
       if (allocated(vmr)) then
         deallocate(vmr, stat=istat, errmsg=errorMessage)
         if (istat .ne. 0) then
           print *,"bandred_real: error when deallocating vmr "//errorMessage
           stop
         endif

         istat = cuda_free(vmr_dev)
         if (istat .ne. 0) then
           print *,"bandred_real: error in cuda_free"
           stop
         endif
       endif

       allocate(vmr(vmr_size), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"bandred_real: error when allocating vmr "//errorMessage
         stop
       endif

       istat = cuda_malloc(vmr_dev, vmr_size*8_8)
       if (istat .ne. 0) then
         print *,"bandred_real: error in cudaMalloc"
         stop
       endif

     endif



     if ((.not. allocated(umc)) .or. (umc_size .gt. ubound(umc, dim=1))) then
       if (allocated(umc)) then
         deallocate(umc, stat=istat, errmsg=errorMessage)
         if (istat .ne. 0) then
           print *,"bandred_real: error when deallocating umc "//errorMessage
           stop
         endif

         istat = cuda_free(umc_dev)
         if (istat .ne. 0) then
            print *,"bandred_real: error in cudaFree"
            stop
          endif

       endif

       allocate(umc(umc_size), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"bandred_real: error when allocating umc "//errorMessage
         stop
       endif

       istat = cuda_malloc(umc_dev, umc_size*8_8)
       if (istat .ne. 0) then
         print *,"bandred_real: error in cudaMalloc"
         stop
       endif

     endif
#else
     ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

     allocate(vmr(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"bandred_real: error when allocating vmr "//errorMessage
       stop
     endif

     allocate(umc(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"bandred_real: error when allocating umc "//errorMessage
       stop
     endif

     allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"bandred_real: error when allocating vr "//errorMessage
       stop
     endif

#endif

#ifdef WITH_GPU_VERSION
     vmr(1 : cur_l_rows * n_cols) = 0.
#else
     vmr(1:l_rows,1:n_cols) = 0.
#endif
     vr(:) = 0
     tmat(:,:,istep) = 0
#ifdef WITH_GPU_VERSION
     umc(1 : umc_size) = 0.

     lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
     lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
     lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

     if(lc_start .le. 0) lc_start = 1

     ! Here we assume that the processor grid and the block grid are aligned
     cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

     if(my_pcol == cur_pcol) then

       istat = cuda_memcpy2d(loc(a(1, lc_start)), lda*8_8, (a_dev + ((lc_start-1) * lda*8_8)), lda*8_8, &
                            lr_end*8_8, (lc_end - lc_start+1), cudaMemcpyDeviceToHost)
       if (istat .ne. 0) then
         print *,"bandred_real: error in cudaMemcpy2d"
         stop
       endif

     endif
#endif

     ! Reduce current block to lower triangular form

     if (useQR) then
       if (which_qr_decomposition == 1) then
         call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), &
                                  tmat(1,1,istep), nbw, work_blocked,       &
                                  work_size, na, n_cols, nblk, nblk,        &
                                  istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                  0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
                                  blockheuristic)
       endif
     else

       do lc = n_cols, 1, -1

         ncol = istep*nbw + lc ! absolute column number of householder vector
         nrow = ncol - nbw ! Absolute number of pivot row

         lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
         lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

         tau = 0

         if (nrow == 1) exit ! Nothing to do

         cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

         if (my_pcol==cur_pcol) then

           ! Get vector to be transformed; distribute last element and norm of
           ! remaining elements to all procs in current column

           vr(1:lr) = a(1:lr,lch) ! vector to be transformed

           if (my_prow==prow(nrow, nblk, np_rows)) then
1012
             aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
             aux1(2) = vr(lr)
           else
             aux1(1) = dot_product(vr(1:lr),vr(1:lr))
             aux1(2) = 0.
           endif
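           ! aux1(1) carries the local squared norm of the vector (on the pivot row the
           ! pivot element itself is excluded and passed separately in aux1(2)), so that
           ! the single allreduce below delivers both quantities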

           call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

           vnorm2 = aux2(1)
           vrl    = aux2(2)

           ! Householder transformation

           call hh_transform_real(vrl, vnorm2, xf, tau)

           ! Scale vr and store Householder vector for back transformation

           vr(1:lr) = vr(1:lr) * xf
           if (my_prow==prow(nrow, nblk, np_rows)) then
             a(1:lr-1,lch) = vr(1:lr-1)
             a(lr,lch) = vrl
             vr(lr) = 1.
           else
             a(1:lr,lch) = vr(1:lr)
           endif

         endif

         ! Broadcast Householder vector and tau along columns

         vr(lr+1) = tau
         call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
#ifdef WITH_GPU_VERSION
         vmr(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
#else
         vmr(1:lr,lc) = vr(1:lr)
#endif
         tau = vr(lr+1)
         tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat

         ! Transform remaining columns in current block with Householder vector
         ! Local dot product

         aux1 = 0

         nlc = 0 ! number of local columns
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1