elpa2_bandred_template.F90 66.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
63
64
65
66
    subroutine bandred_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
67
    (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
Andreas Marek's avatar
Andreas Marek committed
68
     tmat_dev, wantDebug, useGPU, success &
69
#if REALCASE == 1
70
     , useQR)
71
72
#endif
#if COMPLEXCASE == 1
73
     )
74
#endif
75

76
  !-------------------------------------------------------------------------------
77
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
112
      use elpa_abstract_impl
113
114
      implicit none

115
      class(elpa_abstract_impl_t), intent(inout) :: obj
116
117
118
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#if REALCASE == 1
119
#ifdef USE_ASSUMED_SIZE
120
      real(kind=REAL_DATATYPE)                    :: a(lda,*), tmat(nbw,nbw,*)
121
#else
122
      real(kind=REAL_DATATYPE)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
123
#endif
124
125
126
127
128
129
130
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
      complex(kind=COMPLEX_DATATYPE)              :: a(lda,*), tmat(nbw,nbw,*)
#else
      complex(kind=COMPLEX_DATATYPE)              :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
#endif
Andreas Marek's avatar
Andreas Marek committed
131
132
133
134
135
136
137
138
139
140
141
142
#endif /* COMPLEXCASE */

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
      real(kind=REAL_DATATYPE), parameter         :: ZERO = 0.0_rk8, ONE = 1.0_rk8

#else
      real(kind=REAL_DATATYPE), parameter         :: ZERO = 0.0_rk4, ONE = 1.0_rk4
#endif
#endif

#if COMPLEXCASE == 1
143
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
144
      complex(kind=COMPLEX_DATATYPE), parameter   :: ZERO = (0.0_rk8, 0.0_rk8), ONE = (1.0_rk8, 0.0_rk8)
145
#else
Andreas Marek's avatar
Andreas Marek committed
146
      complex(kind=COMPLEX_DATATYPE), parameter   :: ZERO = (0.0_rk4, 0.0_rk4), ONE = (1.0_rk4, 0.0_rk4)
147
148
#endif
#endif /* COMPLEXCASE == 1 */
149

150
151
152
153
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                    :: eps
#endif
      logical, intent(in)                         :: useGPU
154

155
156
157
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
158
      integer(kind=ik)                            :: vmrCols
159
#endif
160
      integer(kind=ik)                            :: mynlc
161
162
163
      integer(kind=ik)                            :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
164

165
166
167
168
169
170
      real(kind=REAL_DATATYPE)                    :: vnorm2
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE)              :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
171
#endif
172

173
174
175
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
Andreas Marek's avatar
Andreas Marek committed
176
      complex(kind=COMPLEX_DATATYPE), allocatable :: vr(:)
177
#endif
178

179
180
181
182
183
184
185
186
187
188
189
190
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable       :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      real(kind=REAL_DATATYPE), allocatable       :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      real(kind=REAL_DATATYPE), allocatable       :: vr(:)
#endif

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
      real(kind=REAL_DATATYPE)                    :: dwork_size(1)
      real(kind=REAL_DATATYPE), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
#endif
191
      ! a_dev is passed from bandred_real to trans_ev_band
192
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
193
#ifdef WITH_MPI
194
195
196
197
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
198
      integer(kind=c_intptr_t)                      :: lc_start, lc_end
199
#if COMPLEXCASE == 1
200
      integer(kind=c_intptr_t)                      :: lce_1, lcs_1, lre_1
201
202
203
204
205
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
206
207
#endif

208
209
210
211
212
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
213

214
215
216
217
218
#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
                                                    ii, pp, transformChunkSize
219
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
220
221
222
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
223

224
      call obj%timer%start("bandred_&
Andreas Marek's avatar
Andreas Marek committed
225
226
227
228
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )
229
      if (wantDebug) call obj%timer%start("mpi_communication")
230
231
232
233
234

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
Andreas Marek's avatar
Andreas Marek committed
235

236
      if (wantDebug) call obj%timer%stop("mpi_communication")
237
238
239
240
241
242
243
      success = .true.


      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
Andreas Marek's avatar
Andreas Marek committed
244
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
245
246
      &MATH_DATATYPE&
      &: ERROR: nbw=',nbw,', nblk=',nblk
Andreas Marek's avatar
Andreas Marek committed
247
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
248
249
      &MATH_DATATYPE&
      &: ELPA2 works only for nbw==n*nblk'
250
251
252
253
254
255
256
257
258
          endif
          success = .false.
          return
        endif
      endif

! na_rows in used nowhere; only na_cols
      if (useGPU) then
#ifdef WITH_MPI
259
#if COMPLEXCASE == 1
260
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
261
#endif
262
263
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
264
265
266
#if COMPLEXCASE == 1
         na_rows = na
#endif
267
        na_cols = na
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
268
#endif /* WITH_MPI */
269
270

        ! Here we convert the regular host array into a pinned host array
271
        successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
272
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
273
          print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
274
275
    &MATH_DATATYPE&
    &: error in cudaMalloc a_dev 1"
Andreas Marek's avatar
Andreas Marek committed
276
          stop 1
277
278
        endif

279
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
280
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
281
          print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
282
283
    &MATH_DATATYPE&
    &: error in cudaMalloc tmat_dev 1"
Andreas Marek's avatar
Andreas Marek committed
284
          stop 1
285
286
        endif

287
        successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
288
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
289
          print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
290
291
    &MATH_DATATYPE&
    &: error in cudaMalloc vav_dev 1"
Andreas Marek's avatar
Andreas Marek committed
292
          stop 1
293
        endif
294
295
296
297
298
299
300
301
302
303
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

304
#if REALCASE == 1
305
306
307
308
      if (useQR) then

        if (useGPU) then
          print *,"qr decomposition at the moment not supported with GPU"
Andreas Marek's avatar
Andreas Marek committed
309
          stop 1
310
311
312
        endif

        if (which_qr_decomposition == 1) then
313
          call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
314
315
316
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
317
            stop 1
318
319
320
321
322
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
323
            stop 1
324
325
326
327
328
329
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
330
            stop 1
331
332
333
334
335
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
336
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
337
338
    &PRECISION&
    &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
339
340
341
342
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
343
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
344
345
    &PRECISION&
    &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
346
347
348
349
350
351
352
353
354
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

          work_size = dwork_size(1)
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
355
            stop 1
356
          endif
357
          work_blocked = CONST_0_0
358
359
360
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
361
            stop 1
362
363
364
365
366
          endif

        endif ! which_qr_decomposition

      endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
367
#endif /* REALCASE */
368
369
370
371
372

      if (useGPU) then

        cur_l_rows = 0
        cur_l_cols = 0
Andreas Marek's avatar
Andreas Marek committed
373

374
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
375
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
376
          print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
377
378
    &MATH_DATATYPE&
    &: error in cudaMemcpy a_dev 2"
Andreas Marek's avatar
Andreas Marek committed
379
          stop 1
380
        endif
381
382
383
384
385
386
387
388
389
390
391
      endif ! useGPU


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

392
393
        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

394
395
396
397
398
399
400
401
402
403
404
405
406
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
407
                print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
408
409
    &MATH_DATATYPE&
    &: error when deallocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
410
                stop 1
411
412
413
414
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
415
              print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
416
417
        &MATH_DATATYPE&
        &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
418
              stop 1
419
420
421
422
423
424
425
426
            endif

          endif

          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
427
                print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
428
429
    &MATH_DATATYPE&
    &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
430
                stop 1
431
432
433
434
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
435
                print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
436
    &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
Andreas Marek's avatar
Andreas Marek committed
437
                stop 1
438
439
440
              endif
            endif

441
#if REALCASE == 1
442
            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
443
444
#endif
#if COMPLEXCASE == 1
445
446
            allocate(vmrCUDA(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
#endif
447
            if (istat .ne. 0) then
448
              print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
449
450
        &MATH_DATATYPE&
        &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
451
              stop 1
452
            endif
453
            successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
454
            if (.not.(successCUDA)) then
455
              print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
456
457
        &MATH_DATATYPE&
        &: error in cudaMalloc: vmr_dev2"
Andreas Marek's avatar
Andreas Marek committed
458
              stop 1
459
460
461
            endif

          endif
462
463
464
465
466

          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
467
                print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
468
469
    &MATH_DATATYPE&
    &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
470
                stop 1
471
472
473
474
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
475
                 print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
476
477
     &MATH_DATATYPE&
     &: error in cudaFree umc_dev 1"
Andreas Marek's avatar
Andreas Marek committed
478
                 stop 1
479
480
481
              endif

            endif
482

483
484
485
#if REALCASE == 1
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
#endif
486
#if COMPLEXCASE == 1
487
488
            allocate(umcCUDA(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
#endif
489
            if (istat .ne. 0) then
490
              print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
491
492
        &MATH_DATATYPE&
        &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
493
              stop 1
494
495
            endif

496
            successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
497
            if (.not.(successCUDA)) then
498
              print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
499
500
        &MATH_DATATYPE&
        &: error in cudaMalloc umc_dev 2"
Andreas Marek's avatar
Andreas Marek committed
501
              stop 1
502
            endif
503

504
          endif
505
506
507

        else ! GPU not used

508
509
510
          ! unify the the name vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
511
512
513

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
514
            print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
515
516
      &MATH_DATATYPE&
      &: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
517
            stop 1
518
519
          endif

520
521
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
522
            print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
523
524
      &MATH_DATATYPE&
      &: error when allocating umcCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
525
            stop 1
526
          endif
Andreas Marek's avatar
Andreas Marek committed
527

528
529
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
530
            print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
531
532
      &MATH_DATATYPE&
      &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
533
            stop 1
534
535
536
537
538
          endif

        endif ! use GPU

        if (useGPU) then
539
#if REALCASE == 1
540
          vmrCUDA(1 : cur_l_rows * n_cols) = CONST_0_0
541
542
543
#endif
#if COMPLEXCASE == 1
          vmrCUDA(1:l_rows,1:n_cols) = CONST_COMPLEX_0_0
544
#endif
545
        else
546
#if REALCASE == 1
547
          vmrCPU(1:l_rows,1:n_cols) = CONST_0_0
548
549
550
551
552
553
554
#endif
#if COMPLEXCASE == 1
          vmrCPU(1:l_rows,1:n_cols) = CONST_COMPLEX_0_0
#endif
        endif ! useGPU

#if REALCASE == 1
555
556
        vr(:) = CONST_0_0
        tmat(:,:,istep) = CONST_0_0
557
558
559
560
561
#endif
#if COMPLEXCASE == 1
        vr(:) = CONST_COMPLEX_0_0
        tmat(:,:,istep) = CONST_COMPLEX_0_0
#endif
562
        if (useGPU) then
563
#if REALCASE == 1
564
          umcCUDA(1 : umc_size) = CONST_0_0
565
#endif
566
567
568
569
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

570
          if (lc_start .le. 0) lc_start = 1
571
572
573
574

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

575
576
577
          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
#if REALCASE == 1
578
                                            lda * size_of_datatype, &
579
580
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
581
                                      int((lda*size_of_datatype),kind=c_intptr_t), &
582
#endif
583
#if REALCASE == 1
584
                                            (a_dev + ((lc_start-1) * lda*size_of_datatype)),    &
585
586
#endif
#if COMPLEXCASE == 1
587
                                            (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
588
#endif
589
#if REALCASE == 1
590
                                            lda*size_of_datatype, lr_end*size_of_datatype, &
591
592
#endif
#if COMPLEXCASE == 1
593
594
                                            int(lda*size_of_datatype,kind=c_intptr_t),              &
                                            int(lr_end*size_of_datatype,kind=c_intptr_t),           &
595
596
#endif
#if REALCASE == 1
597
                                             (lc_end - lc_start+1), cudaMemcpyDeviceToHost)
598
599
#endif
#if COMPLEXCASE == 1
600
                                             int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
601
602
#endif

603

604
            if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
605
              print *,"bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
606
607
        &MATH_DATATYPE&
        &: error in cudaMemcpy2d"
Andreas Marek's avatar
Andreas Marek committed
608
              stop 1
609
610
611
612
613
614
            endif

          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
615
#if REALCASE == 1
616
617
618
619
        if (useQR) then
          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
620
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
621
622
      &PRECISION&
      &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
623
624
625
626
627
628
629
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
630
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
631
632
      &PRECISION&
      &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
633
634
635
636
637
638
639
640
641
642
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
643
#endif /* REALCASE == 1 */
644
645
         do lc = n_cols, 1, -1

646
           ncol = istep*nbw + lc ! absolute column number of householder Vector
647
648
649
650
651
652
653
654
655
656
657
658
659
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

660
             ! Get Vector to be transformed; distribute last element and norm of
661
662
             ! remaining elements to all procs in current column

663
             vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
664
665
666
667
668
669

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
670
#if REALCASE == 1
671
               aux1(2) = CONST_0_0
672
673
674
675
#endif
#if COMPLEXCASE == 1
               aux1(2) = CONST_COMPLEX_0_0
#endif
676
677
678
             endif

#ifdef WITH_MPI
679
             if (wantDebug) call obj%timer%start("mpi_communication")
Andreas Marek's avatar
Andreas Marek committed
680
             call mpi_allreduce(aux1, aux2, 2, &
681
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
682
                                MPI_REAL_PRECISION, &
683
684
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
685
                                MPI_COMPLEX_PRECISION, &
686
#endif
Andreas Marek's avatar
Andreas Marek committed
687
                                MPI_SUM, mpi_comm_rows, mpierr)
688
             if (wantDebug) call obj%timer%stop("mpi_communication")
689
690
691

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
692
#endif
693
694
695
696
697

             vnorm2 = aux2(1)
             vrl    = aux2(2)

             ! Householder transformation
698
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
699
       call hh_transform_real_&
700
701
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
702
       call hh_transform_complex_&
703
#endif
Andreas Marek's avatar
Andreas Marek committed
704
             &PRECISION &
705
                              (obj, vrl, vnorm2, xf, tau, wantDebug)
706
             ! Scale vr and store Householder Vector for back transformation
707
708
709
710
711

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
712
#if REALCASE == 1
713
               vr(lr) = CONST_1_0
714
715
716
717
#endif
#if COMPLEXCASE == 1
               vr(lr) = CONST_COMPLEX_1_0
#endif
718
719
720
721
722
723
             else
               a(1:lr,lch) = vr(1:lr)
             endif

           endif

724
           ! Broadcast Householder Vector and tau along columns
725
726
727

           vr(lr+1) = tau
#ifdef WITH_MPI
728
           if (wantDebug) call obj%timer%start("mpi_communication")
Andreas Marek's avatar
Retab  
Andreas Marek committed
729
     call MPI_Bcast(vr, lr+1, &
730
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
731
                          MPI_REAL_PRECISION, &
732
733
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
734
                          MPI_COMPLEX_PRECISION, &
735
#endif
Andreas Marek's avatar
Andreas Marek committed
736
                          cur_pcol, mpi_comm_cols, mpierr)
737
           if (wantDebug) call obj%timer%stop("mpi_communication")
738
739

#endif /* WITH_MPI */
740

741
           if (useGPU) then
742
#if REALCASE == 1
743
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
744
745
#endif
#if COMPLEXCASE == 1
746
             vmrCUDA(1:lr,lc) = vr(1:lr)
747
#endif
748
749
750
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
751
752
           tau = vr(lr+1)

753
754
755
756
757
758
#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
759
           ! Transform remaining columns in current block with Householder Vector
760
761
           ! Local dot product

762
#if REALCASE == 1
763
           aux1 = 0
764
765
766
767
#endif
#if COMPLEXCASE == 1
          aux1 = CONST_COMPLEX_0_0
#endif
768
769

#ifdef WITH_OPENMP
770
771
772
#if 0
 ! original complex implementation without openmp. check performance
            nlc = 0 ! number of local columns
773
774
775
776
777
778
779
780
781
782
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
783
           if (wantDebug) call obj%timer%start("mpi_communication")
784
785
786
787
788
789
790
791
792
793
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
794

795
796
797
             endif
           enddo

798

799
           if (wantDebug) call obj%timer%stop("mpi_communication")
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
!               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
825

826
827
!             endif
!           enddo
828
#endif /* if 0 */
829

830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
857
           if (wantDebug) call obj%timer%start("mpi_communication")
858
859
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc,     &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
860
                                     MPI_REAL_PRECISION,    &
861
862
863
864
#endif
#if COMPLEXCASE == 1
                                           MPI_COMPLEX_PRECISION, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
865
                         MPI_SUM, mpi_comm_rows, mpierr)
866
           if (wantDebug) call obj%timer%stop("mpi_communication")
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
#if REALCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
896
897
898
899
900
901
902
903
904
905
906
907
908
909

#else /* WITH_OPENMP */

           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
910
           if (wantDebug) call obj%timer%start("mpi_communication")
911
912
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc,      &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
913
                                   MPI_REAL_PRECISION,   &
914
915
916
#endif
#if COMPLEXCASE == 1
                                         MPI_COMPLEX_PRECISION,&
917
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
918
           MPI_SUM, mpi_comm_rows, mpierr)
919
           if (wantDebug) call obj%timer%stop("mpi_communication")
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
#if REALCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
#endif
#if COMPLEXCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
#endif
             endif
           enddo
#endif /* WITH_OPENMP */
939
940
941
942
943
944
         enddo ! lc

         if (useGPU) then
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
945
             successCUDA = cuda_memcpy2d((a_dev+        &
946
#if REALCASE == 1
947
                                         ((lc_start-1)*lda*size_of_datatype)),          &
948
949
#endif
#if COMPLEXCASE == 1
950
                                         int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
951
952
#endif
#if REALCASE == 1
953
                                         lda*size_of_datatype, loc(a(1, lc_start)),    &
954
955
#endif
#if COMPLEXCASE == 1
956
                                         int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
957
958
#endif
#if REALCASE == 1
959
                                         lda*size_of_datatype,  lr_end*size_of_datatype, &
960
961
#endif
#if COMPLEXCASE == 1
962
963
                                         int(lda*size_of_datatype,kind=c_intptr_t),           &
                                         int(lr_end*size_of_datatype,kind=c_intptr_t),        &
964
965
966
#endif
#if REALCASE == 1
                                         (lc_end - lc_start+1),cudaMemcpyHostToDevice)
967
968
#endif
#if COMPLEXCASE == 1
969
                                         int((lc_end - lc_start+1),kind=c_intptr_t), &
970
971
                                         int(cudaMemcpyHostToDevice,kind=c_int))
#endif
972

973
             if (.not.(successCUDA)) then
974
               print *, "bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
975
976
         &MATH_DATATYPE&
         &: cuda memcpy a_dev  failed ", istat
Andreas Marek's avatar
Andreas Marek committed
977
               stop 1