elpa2_bandred_template.F90 67.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
63 64 65 66
    subroutine bandred_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
67
    (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
Andreas Marek's avatar
Andreas Marek committed
68
     tmat_dev, wantDebug, useGPU, success &
69
#if REALCASE == 1
70
     , useQR)
71 72
#endif
#if COMPLEXCASE == 1
73
     )
74
#endif
75

76
  !-------------------------------------------------------------------------------
77
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
112
      use elpa_abstract_impl
113 114
      implicit none

115
      class(elpa_abstract_impl_t), intent(inout) :: obj
116 117 118
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#if REALCASE == 1
119
#ifdef USE_ASSUMED_SIZE
120
      real(kind=REAL_DATATYPE)                    :: a(lda,*), tmat(nbw,nbw,*)
121
#else
122
      real(kind=REAL_DATATYPE)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
123
#endif
124 125 126 127 128 129 130
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
      complex(kind=COMPLEX_DATATYPE)              :: a(lda,*), tmat(nbw,nbw,*)
#else
      complex(kind=COMPLEX_DATATYPE)              :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
#endif
Andreas Marek's avatar
Andreas Marek committed
131 132 133 134 135 136 137 138 139 140 141 142
#endif /* COMPLEXCASE */

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
      real(kind=REAL_DATATYPE), parameter         :: ZERO = 0.0_rk8, ONE = 1.0_rk8

#else
      real(kind=REAL_DATATYPE), parameter         :: ZERO = 0.0_rk4, ONE = 1.0_rk4
#endif
#endif

#if COMPLEXCASE == 1
143
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
144
      complex(kind=COMPLEX_DATATYPE), parameter   :: ZERO = (0.0_rk8, 0.0_rk8), ONE = (1.0_rk8, 0.0_rk8)
145
#else
Andreas Marek's avatar
Andreas Marek committed
146
      complex(kind=COMPLEX_DATATYPE), parameter   :: ZERO = (0.0_rk4, 0.0_rk4), ONE = (1.0_rk4, 0.0_rk4)
147 148
#endif
#endif /* COMPLEXCASE == 1 */
149

150 151 152 153
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                    :: eps
#endif
      logical, intent(in)                         :: useGPU
154

155 156 157
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
158
      integer(kind=ik)                            :: vmrCols
159
#endif
160
      integer(kind=ik)                            :: mynlc
161 162 163
      integer(kind=ik)                            :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
164

165 166 167 168 169 170
      real(kind=REAL_DATATYPE)                    :: vnorm2
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE)              :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
171
#endif
172

173 174 175
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
Andreas Marek's avatar
Andreas Marek committed
176
      complex(kind=COMPLEX_DATATYPE), allocatable :: vr(:)
177
#endif
178

179 180 181 182 183 184 185 186 187 188 189 190
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable       :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      real(kind=REAL_DATATYPE), allocatable       :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      real(kind=REAL_DATATYPE), allocatable       :: vr(:)
#endif

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
      real(kind=REAL_DATATYPE)                    :: dwork_size(1)
      real(kind=REAL_DATATYPE), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
#endif
191
      ! a_dev is passed from bandred_real to trans_ev_band
192
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
193
#ifdef WITH_MPI
194 195 196 197
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
198
      integer(kind=c_intptr_t)                      :: lc_start, lc_end
199
#if COMPLEXCASE == 1
200
      integer(kind=c_intptr_t)                      :: lce_1, lcs_1, lre_1
201 202 203 204 205
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
206 207
#endif

208 209 210 211 212
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
213

214 215 216 217 218
#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
                                                    ii, pp, transformChunkSize
219
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
220 221 222
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
223

224
      call obj%timer%start("bandred_&
Andreas Marek's avatar
Andreas Marek committed
225 226 227 228
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )
229
      if (wantDebug) call obj%timer%start("mpi_communication")
230 231 232 233 234

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
Andreas Marek's avatar
Andreas Marek committed
235

236
      if (wantDebug) call obj%timer%stop("mpi_communication")
237 238 239 240 241 242 243
      success = .true.


      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
Andreas Marek's avatar
Andreas Marek committed
244
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
245 246
      &MATH_DATATYPE&
      &: ERROR: nbw=',nbw,', nblk=',nblk
Andreas Marek's avatar
Andreas Marek committed
247
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
248 249
      &MATH_DATATYPE&
      &: ELPA2 works only for nbw==n*nblk'
250 251 252 253 254 255 256 257 258
          endif
          success = .false.
          return
        endif
      endif

! na_rows in used nowhere; only na_cols
      if (useGPU) then
#ifdef WITH_MPI
259
#if COMPLEXCASE == 1
260
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
261
#endif
262 263
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
264 265 266
#if COMPLEXCASE == 1
         na_rows = na
#endif
267
        na_cols = na
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
268
#endif /* WITH_MPI */
269 270

        ! Here we convert the regular host array into a pinned host array
271
        successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
272
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
273
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
274 275
                  &MATH_DATATYPE&
                  &: error in cudaMalloc a_dev 1"
Andreas Marek's avatar
Andreas Marek committed
276
          stop 1
277 278
        endif

279
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
280
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
281
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
282 283
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
Andreas Marek's avatar
Andreas Marek committed
284
          stop 1
285 286
        endif

287
        successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
288
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
289
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
290 291
                  &MATH_DATATYPE&
                  &: error in cudaMalloc vav_dev 1"
Andreas Marek's avatar
Andreas Marek committed
292
          stop 1
293
        endif
294 295 296 297 298 299 300 301 302 303
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

304
#if REALCASE == 1
305 306 307 308
      if (useQR) then

        if (useGPU) then
          print *,"qr decomposition at the moment not supported with GPU"
Andreas Marek's avatar
Andreas Marek committed
309
          stop 1
310 311 312
        endif

        if (which_qr_decomposition == 1) then
313
          call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
314 315 316
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
317
            stop 1
318 319 320 321 322
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
323
            stop 1
324 325 326 327 328 329
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
330
            stop 1
331 332 333 334 335
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
336
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
337 338
               &PRECISION&
               &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
339 340 341 342
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
343
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
344 345
               &PRECISION&
               &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
346 347 348 349 350 351 352 353 354
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

          work_size = dwork_size(1)
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
355
            stop 1
356
          endif
357
          work_blocked = CONST_0_0
358 359 360
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
361
            stop 1
362 363 364 365 366
          endif

        endif ! which_qr_decomposition

      endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
367
#endif /* REALCASE */
368 369 370 371 372

      if (useGPU) then

        cur_l_rows = 0
        cur_l_cols = 0
Andreas Marek's avatar
Andreas Marek committed
373

374
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
375
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
376
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
377 378
                  &MATH_DATATYPE&
                  &: error in cudaMemcpy a_dev 2"
Andreas Marek's avatar
Andreas Marek committed
379
          stop 1
380
        endif
381 382 383 384 385 386 387 388 389 390 391
      endif ! useGPU


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

392 393
        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

394 395 396 397 398 399 400 401 402 403 404 405 406
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
407
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
408 409
                        &MATH_DATATYPE&
                        &: error when deallocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
410
                stop 1
411 412 413 414
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
415
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
416 417
                      &MATH_DATATYPE&
                      &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
418
              stop 1
419 420 421 422 423 424 425 426
            endif

          endif

          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
427
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
428 429
                        &MATH_DATATYPE&
                        &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
430
                stop 1
431 432 433 434
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
435
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
436
                        &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
Andreas Marek's avatar
Andreas Marek committed
437
                stop 1
438 439 440
              endif
            endif

441
#if REALCASE == 1
442
            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
443 444
#endif
#if COMPLEXCASE == 1
445 446
            allocate(vmrCUDA(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
#endif
447
            if (istat .ne. 0) then
448
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
449 450
                      &MATH_DATATYPE&
                      &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
451
              stop 1
452
            endif
453
            successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
454
            if (.not.(successCUDA)) then
455
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
456 457
                      &MATH_DATATYPE&
                      &: error in cudaMalloc: vmr_dev2"
Andreas Marek's avatar
Andreas Marek committed
458
              stop 1
459 460 461
            endif

          endif
462 463 464 465 466

          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
467
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
468 469
                        &MATH_DATATYPE&
                        &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
470
                stop 1
471 472 473 474
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
475
                 print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
476 477
                         &MATH_DATATYPE&
                         &: error in cudaFree umc_dev 1"
Andreas Marek's avatar
Andreas Marek committed
478
                 stop 1
479 480 481
              endif

            endif
482

483 484 485
#if REALCASE == 1
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
#endif
486
#if COMPLEXCASE == 1
487 488
            allocate(umcCUDA(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
#endif
489
            if (istat .ne. 0) then
490
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
491 492
                      &MATH_DATATYPE&
                      &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
493
              stop 1
494 495
            endif

496
            successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
497
            if (.not.(successCUDA)) then
498
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
499 500
                      &MATH_DATATYPE&
                      &: error in cudaMalloc umc_dev 2"
Andreas Marek's avatar
Andreas Marek committed
501
              stop 1
502
            endif
503

504
          endif
505 506 507

        else ! GPU not used

508 509 510
          ! unify the the name vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
511 512 513

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
514
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
515 516
                     &MATH_DATATYPE&
                     &: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
517
            stop 1
518 519
          endif

520 521
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
522
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
523 524
                    &MATH_DATATYPE&
                    &: error when allocating umcCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
525
            stop 1
526
          endif
Andreas Marek's avatar
Andreas Marek committed
527

528 529
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
530
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
531 532
                    &MATH_DATATYPE&
                    &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
533
            stop 1
534 535 536 537 538
          endif

        endif ! use GPU

        if (useGPU) then
539
#if REALCASE == 1
540
          vmrCUDA(1 : cur_l_rows * n_cols) = CONST_0_0
541 542 543
#endif
#if COMPLEXCASE == 1
          vmrCUDA(1:l_rows,1:n_cols) = CONST_COMPLEX_0_0
544
#endif
545
        else
546
#if REALCASE == 1
547
          vmrCPU(1:l_rows,1:n_cols) = CONST_0_0
548 549 550 551 552 553 554
#endif
#if COMPLEXCASE == 1
          vmrCPU(1:l_rows,1:n_cols) = CONST_COMPLEX_0_0
#endif
        endif ! useGPU

#if REALCASE == 1
555 556
        vr(:) = CONST_0_0
        tmat(:,:,istep) = CONST_0_0
557 558 559 560 561
#endif
#if COMPLEXCASE == 1
        vr(:) = CONST_COMPLEX_0_0
        tmat(:,:,istep) = CONST_COMPLEX_0_0
#endif
562
        if (useGPU) then
563
#if REALCASE == 1
564
          umcCUDA(1 : umc_size) = CONST_0_0
565
#endif
566 567 568 569
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

570
          if (lc_start .le. 0) lc_start = 1
571 572 573 574

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

575 576 577
          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
#if REALCASE == 1
578
                                            lda * size_of_datatype, &
579 580
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
581
                                      int((lda*size_of_datatype),kind=c_intptr_t), &
582
#endif
583
#if REALCASE == 1
584
                                            (a_dev + ((lc_start-1) * lda*size_of_datatype)),    &
585 586
#endif
#if COMPLEXCASE == 1
587
                                            (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
588
#endif
589
#if REALCASE == 1
590
                                            lda*size_of_datatype, lr_end*size_of_datatype, &
591 592
#endif
#if COMPLEXCASE == 1
593 594
                                            int(lda*size_of_datatype,kind=c_intptr_t),              &
                                            int(lr_end*size_of_datatype,kind=c_intptr_t),           &
595 596
#endif
#if REALCASE == 1
597
                                             (lc_end - lc_start+1), cudaMemcpyDeviceToHost)
598 599
#endif
#if COMPLEXCASE == 1
600
                                             int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
601 602
#endif

603

604
            if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
605
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
606 607
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy2d"
Andreas Marek's avatar
Andreas Marek committed
608
              stop 1
609 610 611 612 613 614
            endif

          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
615
#if REALCASE == 1
616 617 618 619
        if (useQR) then
          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
620
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
621 622
                 &PRECISION&
                 &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
623 624 625 626 627 628 629
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
630
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
631 632
                 &PRECISION&
                 &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
633 634 635 636 637 638 639 640 641 642
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
643
#endif /* REALCASE == 1 */
644 645
         do lc = n_cols, 1, -1

646
           ncol = istep*nbw + lc ! absolute column number of householder Vector
647 648 649 650 651 652 653 654 655 656 657 658 659
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

660
             ! Get Vector to be transformed; distribute last element and norm of
661 662
             ! remaining elements to all procs in current column

663
             vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
664 665 666 667 668 669

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
670
#if REALCASE == 1
671
               aux1(2) = CONST_0_0
672 673 674 675
#endif
#if COMPLEXCASE == 1
               aux1(2) = CONST_COMPLEX_0_0
#endif
676 677 678
             endif

#ifdef WITH_MPI
679
             if (wantDebug) call obj%timer%start("mpi_communication")
Andreas Marek's avatar
Andreas Marek committed
680
             call mpi_allreduce(aux1, aux2, 2, &
681
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
682
                                MPI_REAL_PRECISION, &
683 684
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
685
                                MPI_COMPLEX_PRECISION, &
686
#endif
Andreas Marek's avatar
Andreas Marek committed
687
                                MPI_SUM, mpi_comm_rows, mpierr)
688
             if (wantDebug) call obj%timer%stop("mpi_communication")
689 690 691

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
692
#endif
693 694 695 696 697

             vnorm2 = aux2(1)
             vrl    = aux2(2)

             ! Householder transformation
698
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
699
       call hh_transform_real_&
700 701
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
702
       call hh_transform_complex_&
703
#endif
Andreas Marek's avatar
Andreas Marek committed
704
             &PRECISION &
705
                              (obj, vrl, vnorm2, xf, tau, wantDebug)
706
             ! Scale vr and store Householder Vector for back transformation
707 708 709 710 711

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
712
#if REALCASE == 1
713
               vr(lr) = CONST_1_0
714 715 716 717
#endif
#if COMPLEXCASE == 1
               vr(lr) = CONST_COMPLEX_1_0
#endif
718 719 720 721 722 723
             else
               a(1:lr,lch) = vr(1:lr)
             endif

           endif

724
           ! Broadcast Householder Vector and tau along columns
725 726 727

           vr(lr+1) = tau
#ifdef WITH_MPI
728
           if (wantDebug) call obj%timer%start("mpi_communication")
Andreas Marek's avatar
Retab  
Andreas Marek committed
729
     call MPI_Bcast(vr, lr+1, &
730
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
731
                          MPI_REAL_PRECISION, &
732 733
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
734
                          MPI_COMPLEX_PRECISION, &
735
#endif
Andreas Marek's avatar
Andreas Marek committed
736
                          cur_pcol, mpi_comm_cols, mpierr)
737
           if (wantDebug) call obj%timer%stop("mpi_communication")
738 739

#endif /* WITH_MPI */
740

741
           if (useGPU) then
742
#if REALCASE == 1
743
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
744 745
#endif
#if COMPLEXCASE == 1
746
             vmrCUDA(1:lr,lc) = vr(1:lr)
747
#endif
748 749 750
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
751 752
           tau = vr(lr+1)

753 754 755 756 757 758
#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
759
           ! Transform remaining columns in current block with Householder Vector
760 761
           ! Local dot product

762
#if REALCASE == 1
763
           aux1 = 0
764 765 766 767
#endif
#if COMPLEXCASE == 1
          aux1 = CONST_COMPLEX_0_0
#endif
768 769

#ifdef WITH_OPENMP
770 771 772
#if 0
 ! original complex implementation without openmp. check performance
            nlc = 0 ! number of local columns
773 774 775 776 777 778 779 780 781 782
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
783
           if (wantDebug) call obj%timer%start("mpi_communication")
784 785 786 787 788 789 790 791 792 793
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
794

795 796 797
             endif
           enddo

798

799
           if (wantDebug) call obj%timer%stop("mpi_communication")
800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
!               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
825

826 827
!             endif
!           enddo
828
#endif /* if 0 */
829

830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
857
           if (wantDebug) call obj%timer%start("mpi_communication")
858 859
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc,     &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
860
                                     MPI_REAL_PRECISION,    &
861 862 863 864
#endif
#if COMPLEXCASE == 1
                                           MPI_COMPLEX_PRECISION, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
865
                         MPI_SUM, mpi_comm_rows, mpierr)
866
           if (wantDebug) call obj%timer%stop("mpi_communication")
867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
#if REALCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
896 897 898 899 900 901 902 903 904 905 906 907 908 909

#else /* WITH_OPENMP */