elpa2_bandred_template.F90 63 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
63
64
65
66
    subroutine bandred_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
67
    (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
Andreas Marek's avatar
Andreas Marek committed
68
     tmat_dev, wantDebug, useGPU, success &
69
#if REALCASE == 1
70
     , useQR)
71
72
#endif
#if COMPLEXCASE == 1
73
     )
74
#endif
75

76
  !-------------------------------------------------------------------------------
77
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
112
      use elpa_abstract_impl
113
      implicit none
114
#include "../general/precision_kinds.F90"
115
      class(elpa_abstract_impl_t), intent(inout) :: obj
116
117
118
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#ifdef USE_ASSUMED_SIZE
119
      MATH_DATATYPE(kind=rck)                    :: a(lda,*), tmat(nbw,nbw,*)
120
#else
121
      MATH_DATATYPE(kind=rck)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
122
#endif
123

124
#if REALCASE == 1
125
      real(kind=rk)                    :: eps
126
127
#endif
      logical, intent(in)                         :: useGPU
128

129
130
131
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
132
      integer(kind=ik)                            :: vmrCols
133
#endif
134
      integer(kind=ik)                            :: mynlc
135
136
137
      integer(kind=ik)                            :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
138

139
140
      real(kind=rk)                    :: vnorm2
      MATH_DATATYPE(kind=rck)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
141

142
!      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
143
144
145
      MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      MATH_DATATYPE(kind=rck), allocatable :: vr(:)
146
147
148
149

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
150
151
      real(kind=rk)                    :: dwork_size(1)
      real(kind=rk), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
152
#endif
153
      ! a_dev is passed from bandred_real to trans_ev_band
154
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
155
#ifdef WITH_MPI
156
157
158
159
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
160
      integer(kind=c_intptr_t)                      :: lc_start, lc_end
161
#if COMPLEXCASE == 1
162
      integer(kind=c_intptr_t)                      :: lce_1, lcs_1, lre_1
163
164
165
166
167
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
168
169
#endif

170
171
172
173
174
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
175

176
177
178
179
180
#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
                                                    ii, pp, transformChunkSize
181
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
182
183
184
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
185

186
      call obj%timer%start("bandred_&
Andreas Marek's avatar
Andreas Marek committed
187
188
189
190
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )
191
      if (wantDebug) call obj%timer%start("mpi_communication")
192
193
194
195
196

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
Andreas Marek's avatar
Andreas Marek committed
197

198
      if (wantDebug) call obj%timer%stop("mpi_communication")
199
200
201
202
203
204
205
      success = .true.


      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
Andreas Marek's avatar
Andreas Marek committed
206
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
207
208
      &MATH_DATATYPE&
      &: ERROR: nbw=',nbw,', nblk=',nblk
Andreas Marek's avatar
Andreas Marek committed
209
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
210
211
      &MATH_DATATYPE&
      &: ELPA2 works only for nbw==n*nblk'
212
213
214
215
216
217
218
219
220
          endif
          success = .false.
          return
        endif
      endif

! na_rows in used nowhere; only na_cols
      if (useGPU) then
#ifdef WITH_MPI
221
#if COMPLEXCASE == 1
222
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
223
#endif
224
225
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
226
227
228
#if COMPLEXCASE == 1
         na_rows = na
#endif
229
        na_cols = na
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
230
#endif /* WITH_MPI */
231
232

        ! Here we convert the regular host array into a pinned host array
233
        successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
234
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
235
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
236
237
                  &MATH_DATATYPE&
                  &: error in cudaMalloc a_dev 1"
Andreas Marek's avatar
Andreas Marek committed
238
          stop 1
239
240
        endif

241
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
242
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
243
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
244
245
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
Andreas Marek's avatar
Andreas Marek committed
246
          stop 1
247
248
        endif

249
        successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
250
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
251
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
252
253
                  &MATH_DATATYPE&
                  &: error in cudaMalloc vav_dev 1"
Andreas Marek's avatar
Andreas Marek committed
254
          stop 1
255
        endif
256
257
258
259
260
261
262
263
264
265
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

266
#if REALCASE == 1
267
268
269
270
      if (useQR) then

        if (useGPU) then
          print *,"qr decomposition at the moment not supported with GPU"
Andreas Marek's avatar
Andreas Marek committed
271
          stop 1
272
273
274
        endif

        if (which_qr_decomposition == 1) then
275
          call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
276
277
278
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
279
            stop 1
280
281
282
283
284
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
285
            stop 1
286
287
288
289
290
291
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
292
            stop 1
293
294
295
296
297
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
298
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
299
300
               &PRECISION&
               &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
301
302
303
304
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
305
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
306
307
               &PRECISION&
               &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
308
309
310
311
312
313
314
315
316
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

          work_size = dwork_size(1)
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
317
            stop 1
318
          endif
319
          work_blocked = CONST_0_0
320
321
322
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
323
            stop 1
324
325
326
327
328
          endif

        endif ! which_qr_decomposition

      endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
329
#endif /* REALCASE */
330
331
332
333
334

      if (useGPU) then

        cur_l_rows = 0
        cur_l_cols = 0
Andreas Marek's avatar
Andreas Marek committed
335

336
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
337
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
338
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
339
340
                  &MATH_DATATYPE&
                  &: error in cudaMemcpy a_dev 2"
Andreas Marek's avatar
Andreas Marek committed
341
          stop 1
342
        endif
343
344
345
346
347
348
349
350
351
352
353
      endif ! useGPU


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

354
355
        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

356
357
358
359
360
361
362
363
364
365
366
367
368
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
369
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
370
371
                        &MATH_DATATYPE&
                        &: error when deallocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
372
                stop 1
373
374
375
376
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
377
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
378
379
                      &MATH_DATATYPE&
                      &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
380
              stop 1
381
382
383
384
385
386
387
388
            endif

          endif

          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
389
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
390
391
                        &MATH_DATATYPE&
                        &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
392
                stop 1
393
394
395
396
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
397
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
398
                        &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
Andreas Marek's avatar
Andreas Marek committed
399
                stop 1
400
401
402
403
              endif
            endif

            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
404

405
            if (istat .ne. 0) then
406
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
407
408
                      &MATH_DATATYPE&
                      &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
409
              stop 1
410
            endif
411
            successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
412
            if (.not.(successCUDA)) then
413
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
414
415
                      &MATH_DATATYPE&
                      &: error in cudaMalloc: vmr_dev2"
Andreas Marek's avatar
Andreas Marek committed
416
              stop 1
417
418
419
            endif

          endif
420
421
422
423
424

          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
425
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
426
427
                        &MATH_DATATYPE&
                        &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
428
                stop 1
429
430
431
432
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
433
                 print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
434
435
                         &MATH_DATATYPE&
                         &: error in cudaFree umc_dev 1"
Andreas Marek's avatar
Andreas Marek committed
436
                 stop 1
437
438
439
              endif

            endif
440

441
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
442

443
            if (istat .ne. 0) then
444
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
445
446
                      &MATH_DATATYPE&
                      &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
447
              stop 1
448
449
            endif

450
            successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
451
            if (.not.(successCUDA)) then
452
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
453
454
                      &MATH_DATATYPE&
                      &: error in cudaMalloc umc_dev 2"
Andreas Marek's avatar
Andreas Marek committed
455
              stop 1
456
            endif
457

458
          endif
459
460
461

        else ! GPU not used

462
463
464
          ! unify the the name vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
465
466
467

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
468
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
469
470
                     &MATH_DATATYPE&
                     &: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
471
            stop 1
472
473
          endif

474
475
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
476
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
477
478
                    &MATH_DATATYPE&
                    &: error when allocating umcCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
479
            stop 1
480
          endif
Andreas Marek's avatar
Andreas Marek committed
481

482
483
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
484
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
485
486
                    &MATH_DATATYPE&
                    &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
487
            stop 1
488
489
490
491
492
          endif

        endif ! use GPU

        if (useGPU) then
493
#if REALCASE == 1
494
          vmrCUDA(1 : cur_l_rows * n_cols) = CONST_0_0
495
496
#endif
#if COMPLEXCASE == 1
497
          vmrCUDA(1: cur_l_rows * n_cols) = CONST_COMPLEX_0_0
498
#endif
499
        else
500
#if REALCASE == 1
501
          vmrCPU(1:l_rows,1:n_cols) = CONST_0_0
502
503
504
505
506
507
508
#endif
#if COMPLEXCASE == 1
          vmrCPU(1:l_rows,1:n_cols) = CONST_COMPLEX_0_0
#endif
        endif ! useGPU

#if REALCASE == 1
509
510
        vr(:) = CONST_0_0
        tmat(:,:,istep) = CONST_0_0
511
512
513
514
515
#endif
#if COMPLEXCASE == 1
        vr(:) = CONST_COMPLEX_0_0
        tmat(:,:,istep) = CONST_COMPLEX_0_0
#endif
516
        if (useGPU) then
517
#if REALCASE == 1
518
          umcCUDA(1 : umc_size) = CONST_0_0
519
#endif
520
521
522
523
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

524
          if (lc_start .le. 0) lc_start = 1
525
526
527
528

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

529
530
          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
Andreas Marek's avatar
Retab    
Andreas Marek committed
531
                                      int((lda*size_of_datatype),kind=c_intptr_t), &
532
533
534
535
                                            (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
                                            int(lda*size_of_datatype,kind=c_intptr_t),              &
                                            int(lr_end*size_of_datatype,kind=c_intptr_t),           &
                                             int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
536

537

538
            if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
539
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
540
541
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy2d"
Andreas Marek's avatar
Andreas Marek committed
542
              stop 1
543
544
545
546
547
            endif
          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
548
#if REALCASE == 1
549
550
551
552
        if (useQR) then
          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
553
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
554
555
                 &PRECISION&
                 &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
556
557
558
559
560
561
562
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
563
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
564
565
                 &PRECISION&
                 &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
566
567
568
569
570
571
572
573
574
575
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
576
#endif /* REALCASE == 1 */
577
578
         do lc = n_cols, 1, -1

579
           ncol = istep*nbw + lc ! absolute column number of householder Vector
580
581
582
583
584
585
586
587
588
589
590
591
592
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

593
             ! Get Vector to be transformed; distribute last element and norm of
594
595
             ! remaining elements to all procs in current column

596
             vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
597
598
599
600
601
602

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
603
#if REALCASE == 1
604
               aux1(2) = CONST_0_0
605
606
607
608
#endif
#if COMPLEXCASE == 1
               aux1(2) = CONST_COMPLEX_0_0
#endif
609
610
611
             endif

#ifdef WITH_MPI
612
             if (wantDebug) call obj%timer%start("mpi_communication")
Andreas Marek's avatar
Andreas Marek committed
613
             call mpi_allreduce(aux1, aux2, 2, &
614
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
615
                                MPI_REAL_PRECISION, &
616
617
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
618
                                MPI_COMPLEX_PRECISION, &
619
#endif
Andreas Marek's avatar
Andreas Marek committed
620
                                MPI_SUM, mpi_comm_rows, mpierr)
621
             if (wantDebug) call obj%timer%stop("mpi_communication")
622
623
624

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
625
#endif
626
627
628
629
630

             vnorm2 = aux2(1)
             vrl    = aux2(2)

             ! Householder transformation
631
#if REALCASE == 1
Andreas Marek's avatar
Retab    
Andreas Marek committed
632
       call hh_transform_real_&
633
634
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Retab    
Andreas Marek committed
635
       call hh_transform_complex_&
636
#endif
Andreas Marek's avatar
Andreas Marek committed
637
             &PRECISION &
638
                              (obj, vrl, vnorm2, xf, tau, wantDebug)
639
             ! Scale vr and store Householder Vector for back transformation
640
641
642
643
644

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
645
#if REALCASE == 1
646
               vr(lr) = CONST_1_0
647
648
649
650
#endif
#if COMPLEXCASE == 1
               vr(lr) = CONST_COMPLEX_1_0
#endif
651
652
653
654
655
656
             else
               a(1:lr,lch) = vr(1:lr)
             endif

           endif

657
           ! Broadcast Householder Vector and tau along columns
658
659
660

           vr(lr+1) = tau
#ifdef WITH_MPI
661
           if (wantDebug) call obj%timer%start("mpi_communication")
662
           call MPI_Bcast(vr, lr+1, &
663
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
664
                          MPI_REAL_PRECISION, &
665
666
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
667
                          MPI_COMPLEX_PRECISION, &
668
#endif
Andreas Marek's avatar
Andreas Marek committed
669
                          cur_pcol, mpi_comm_cols, mpierr)
670
           if (wantDebug) call obj%timer%stop("mpi_communication")
671
672

#endif /* WITH_MPI */
673

674
675
           if (useGPU) then
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
676
677
678
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
679
680
           tau = vr(lr+1)

681
682
683
684
685
686
#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
687
           ! Transform remaining columns in current block with Householder Vector
688
689
           ! Local dot product

690
#if REALCASE == 1
691
           aux1 = 0
692
693
694
695
#endif
#if COMPLEXCASE == 1
          aux1 = CONST_COMPLEX_0_0
#endif
696
697

#ifdef WITH_OPENMP
698
699
700
#if 0
 ! original complex implementation without openmp. check performance
            nlc = 0 ! number of local columns
701
702
703
704
705
706
707
708
709
710
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
711
           if (wantDebug) call obj%timer%start("mpi_communication")
712
713
714
715
716
717
718
719
720
721
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
722

723
724
725
             endif
           enddo

726

727
           if (wantDebug) call obj%timer%stop("mpi_communication")
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
!               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
753

754
755
!             endif
!           enddo
756
#endif /* if 0 */
757

758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
785
           if (wantDebug) call obj%timer%start("mpi_communication")
786
787
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc,     &
#if REALCASE == 1
Andreas Marek's avatar
Retab    
Andreas Marek committed
788
                                     MPI_REAL_PRECISION,    &
789
790
791
792
#endif
#if COMPLEXCASE == 1
                                           MPI_COMPLEX_PRECISION, &
#endif
Andreas Marek's avatar
Retab    
Andreas Marek committed
793
                         MPI_SUM, mpi_comm_rows, mpierr)
794
           if (wantDebug) call obj%timer%stop("mpi_communication")
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
#if REALCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
824
825
826
827
828
829
830
831
832
833
834
835
836
837

#else /* WITH_OPENMP */

           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
838
           if (wantDebug) call obj%timer%start("mpi_communication")
839
840
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc,      &
#if REALCASE == 1
Andreas Marek's avatar
Retab    
Andreas Marek committed
841
                                   MPI_REAL_PRECISION,   &
842
843
844
#endif
#if COMPLEXCASE == 1
                                         MPI_COMPLEX_PRECISION,&
845
#endif
Andreas Marek's avatar
Retab    
Andreas Marek committed
846
           MPI_SUM, mpi_comm_rows, mpierr)
847
           if (wantDebug) call obj%timer%stop("mpi_communication")
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
#if REALCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
#endif
#if COMPLEXCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
#endif
             endif
           enddo
#endif /* WITH_OPENMP */
867
868
869
870
871
872
         enddo ! lc

         if (useGPU) then
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
873
             successCUDA = cuda_memcpy2d((a_dev+        &
874
875
876
877
878
                                         int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
                                         int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
                                         int(lda*size_of_datatype,kind=c_intptr_t),           &
                                         int(lr_end*size_of_datatype,kind=c_intptr_t),        &
                                         int((lc_end - lc_start+1),kind=c_intptr_t), &
879
                                         int(cudaMemcpyHostToDevice,kind=c_int))
880

881
             if (.not.(successCUDA)) then
882
               print *, "bandred_&
883
884
                        &MATH_DATATYPE&
                        &: cuda memcpy a_dev  failed ", istat
Andreas Marek's avatar
Andreas Marek committed
885
               stop 1
886
             endif
887
888
889
890
891
892
893
           endif
         endif

         ! Calculate scalar products of stored Householder vectors.
         ! This can be done in different ways, we use dsyrk

         vav = 0
894
         call obj%timer%start("blas")
895
896
         if (useGPU) then
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
897
898
899
900
901
902
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',            &
#endif
#if COMPLEXCASE == 1
             call PRECISION_HERK('U', 'C',            &
#endif
Andreas Marek's avatar
Retab    
Andreas Marek committed
903
                           n_cols, l_rows, ONE, &
904
905
906
                           vmrCUDA, cur_l_rows, &
                           ZERO, vav, ubound(vav,dim=1))

907
         else ! useGPU
908
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
909
910
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',           &
911
912
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
913
             call PRECISION_HERK('U', 'C',           &
914
#endif
Andreas Marek's avatar
Retab    
Andreas Marek committed
915
                           n_cols, l_rows, ONE, vmrCPU, ubound(vmrCPU,dim=1), ZERO, vav, ubound(vav,dim=1))
Andreas Marek's avatar
Andreas Marek committed
916
         endif
917
         call obj%timer%stop("blas")
918
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
919
         call symm_matrix_allreduce_&
920
921
#endif
#if COMPLEXCASE == 1
922
         call herm_matrix_allreduce_&
923
#endif
Andreas Marek's avatar
Andreas Marek committed
924
         &PRECISION &
925
                         (obj, n_cols,vav, nbw, nbw,mpi_comm_rows)
926
         ! Calculate triangular matrix T for block Householder Transformation
927
         call obj%timer%start("blas")
928
929
930
         do lc=n_cols,1,-1
           tau = tmat(lc,lc,istep)
           if (lc<n_cols) then
931
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
932
             call PRECISION_TRMV('U', 'T', 'N',          &
933
934
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
935
             call PRECISION_TRMV('U', 'C', 'N',          &
936
#endif
Andreas Marek's avatar
Andreas Marek committed
937
                                 n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
938
939

#if REALCASE == 1
940
             tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
941
942
943
944
#endif
#if COMPLEXCASE == 1
             tmat(lc,lc+1:n_cols,istep) = -tau * conjg(vav(lc+1:n_cols,lc))
#endif
945
946
           endif
         enddo
947
         call obj%timer%stop("blas")
948
949
950
#if REALCASE == 1
       endif !useQR
#endif
951
952
       ! Transpose vmr -> vmc (stored in umc, second half)
       if (useGPU) then
953
         call elpa_transpose_vectors_&
Andreas Marek's avatar
Andreas Marek committed
954
955
956
              &MATH_DATATYPE&
              &_&
              &PRECISION &
957
958
959
                           (obj, vmrCUDA, cur_l_rows, mpi_comm_rows, &
                            umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, &
                            mpi_comm_cols, 1, istep*nbw, n_cols, nblk)
960
       else ! useGPU
961
         call elpa_transpose_vectors_&
Andreas Marek's avatar
Andreas Marek committed
962
963
964
              &MATH_DATATYPE&
              &_&
              &PRECISION &
965
                                           (obj, vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
966
967
968
969
970
971
972
973
                                            umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
                                            1, istep*nbw, n_cols, nblk)
       endif

       ! Calculate umc = A**T * vmr
       ! Note that the distributed A has to be transposed
       ! Opposed to direct tridiagonalization there is no need to use the cache locality
       ! of the tiles, so we can use strips of the matrix
Andreas Marek's avatar
Andreas Marek committed
974
975


976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
#if 0
       ! original complex implemetation check for performance
       umcCPU(1:l_cols,1:n_cols) = CONST_COMPLEX_0_0
       vmrCPU(1:l_rows,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0

       if (l_cols>0 .and. l_rows>0) then
         do i=0,(istep*nbw-1)/tile_size

           lcs = i*l_cols_tile+1
           lce = min(l_cols,(i+1)*l_cols_tile)
           if (lce<lcs) cycle

           lre = min(l_rows,(i+1)*l_rows_tile)

             call obj%timer%start("blas")
             call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a(1,lcs), ubound(a,dim=1), &
                        vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
             call obj%timer%stop("blas")

           if (i==0) cycle
           lre = min(l_rows,i*l_rows_tile)
             call obj%timer%start("blas")
             call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a(1,lcs), lda, &
                        umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
             call obj%timer%stop("blas")
         enddo

       endif ! (l_cols>0 .and. l_rows>0)
#endif /* if 0 */

       !Code for Algorithm 4

       ! n_way is actually a branch for the number of OpenMP threads
       n_way = 1
#ifdef WITH_OPENMP
1011

Andreas Marek's avatar
Andreas Marek committed
1012
#if REALCASE == 1
1013
1014
1015
       n_way = omp_get_max_threads()

       !$omp parallel private( i,lcs,lce,lrs,lre)
Andreas Marek's avatar
Andreas Marek committed
1016
#endif