elpa2_bandred_template.F90 64.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
63 64 65 66
    subroutine bandred_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
67
    (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
Andreas Marek's avatar
Andreas Marek committed
68
     tmat_dev, wantDebug, useGPU, success &
69
#if REALCASE == 1
70
     , useQR)
71 72
#endif
#if COMPLEXCASE == 1
73
     )
74
#endif
75

76
  !-------------------------------------------------------------------------------
77
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
112
      use elpa_abstract_impl
113 114
      implicit none

115
      class(elpa_abstract_impl_t), intent(inout) :: obj
116 117 118
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#if REALCASE == 1
119
#ifdef USE_ASSUMED_SIZE
120
      real(kind=REAL_DATATYPE)                    :: a(lda,*), tmat(nbw,nbw,*)
121
#else
122
      real(kind=REAL_DATATYPE)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
123
#endif
124 125 126 127 128 129 130
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
      complex(kind=COMPLEX_DATATYPE)              :: a(lda,*), tmat(nbw,nbw,*)
#else
      complex(kind=COMPLEX_DATATYPE)              :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
#endif
Andreas Marek's avatar
Andreas Marek committed
131 132 133 134 135 136 137 138 139 140 141 142
#endif /* COMPLEXCASE */

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
      real(kind=REAL_DATATYPE), parameter         :: ZERO = 0.0_rk8, ONE = 1.0_rk8

#else
      real(kind=REAL_DATATYPE), parameter         :: ZERO = 0.0_rk4, ONE = 1.0_rk4
#endif
#endif

#if COMPLEXCASE == 1
143
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
144
      complex(kind=COMPLEX_DATATYPE), parameter   :: ZERO = (0.0_rk8, 0.0_rk8), ONE = (1.0_rk8, 0.0_rk8)
145
#else
Andreas Marek's avatar
Andreas Marek committed
146
      complex(kind=COMPLEX_DATATYPE), parameter   :: ZERO = (0.0_rk4, 0.0_rk4), ONE = (1.0_rk4, 0.0_rk4)
147 148
#endif
#endif /* COMPLEXCASE == 1 */
149

150
      logical, intent(in)                         :: useGPU
151

152 153 154
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
155
      integer(kind=ik)                            :: vmrCols
156
#endif
Andreas Marek's avatar
Andreas Marek committed
157 158 159 160
#ifdef WITH_OPENMP
      integer(kind=ik)                            :: mynlc, lrs, transformChunkSize
#endif
      integer(kind=ik)                            :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
161 162
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
163

164 165 166 167 168 169
      real(kind=REAL_DATATYPE)                    :: vnorm2
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE)              :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
170
#endif
171

172
#if COMPLEXCASE == 1
173 174
!      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
175
      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
Andreas Marek's avatar
Andreas Marek committed
176
      complex(kind=COMPLEX_DATATYPE), allocatable :: vr(:)
177
#endif
178

179 180 181 182 183 184 185 186 187 188 189 190
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable       :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      real(kind=REAL_DATATYPE), allocatable       :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      real(kind=REAL_DATATYPE), allocatable       :: vr(:)
#endif

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
      real(kind=REAL_DATATYPE)                    :: dwork_size(1)
      real(kind=REAL_DATATYPE), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
#endif
191
      ! a_dev is passed from bandred_real to trans_ev_band
192
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
193
#ifdef WITH_MPI
194 195 196 197
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
198
      integer(kind=c_intptr_t)                      :: lc_start, lc_end
199
#if COMPLEXCASE == 1
200
      integer(kind=c_intptr_t)                      :: lce_1, lcs_1, lre_1
201 202 203 204 205
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
206 207
#endif

208 209 210 211 212
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
213

214 215 216 217
#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
Andreas Marek's avatar
Andreas Marek committed
218
                                                    ii, pp
219
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
220 221 222
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
223

224
      call obj%timer%start("bandred_&
Andreas Marek's avatar
Andreas Marek committed
225 226 227 228
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )
229
      if (wantDebug) call obj%timer%start("mpi_communication")
230 231 232 233 234

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
Andreas Marek's avatar
Andreas Marek committed
235

236
      if (wantDebug) call obj%timer%stop("mpi_communication")
237 238 239 240 241 242 243
      success = .true.


      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
Andreas Marek's avatar
Andreas Marek committed
244
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
245 246
      &MATH_DATATYPE&
      &: ERROR: nbw=',nbw,', nblk=',nblk
Andreas Marek's avatar
Andreas Marek committed
247
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
248 249
      &MATH_DATATYPE&
      &: ELPA2 works only for nbw==n*nblk'
250 251 252 253 254 255 256 257 258
          endif
          success = .false.
          return
        endif
      endif

! na_rows in used nowhere; only na_cols
      if (useGPU) then
#ifdef WITH_MPI
259
#if COMPLEXCASE == 1
260
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
261
#endif
262 263
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
264 265 266
#if COMPLEXCASE == 1
         na_rows = na
#endif
267
        na_cols = na
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
268
#endif /* WITH_MPI */
269 270

        ! Here we convert the regular host array into a pinned host array
271
        successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
272
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
273
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
274 275
                  &MATH_DATATYPE&
                  &: error in cudaMalloc a_dev 1"
Andreas Marek's avatar
Andreas Marek committed
276
          stop 1
277 278
        endif

279
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
280
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
281
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
282 283
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
Andreas Marek's avatar
Andreas Marek committed
284
          stop 1
285 286
        endif

287
        successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
288
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
289
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
290 291
                  &MATH_DATATYPE&
                  &: error in cudaMalloc vav_dev 1"
Andreas Marek's avatar
Andreas Marek committed
292
          stop 1
293
        endif
294 295 296 297 298 299 300 301 302 303
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

304
#if REALCASE == 1
305 306 307 308
      if (useQR) then

        if (useGPU) then
          print *,"qr decomposition at the moment not supported with GPU"
Andreas Marek's avatar
Andreas Marek committed
309
          stop 1
310 311 312
        endif

        if (which_qr_decomposition == 1) then
313
          call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
314 315 316
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
317
            stop 1
318 319 320 321 322
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
323
            stop 1
324 325 326 327 328 329
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
330
            stop 1
331 332 333 334 335
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
336
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
337 338
               &PRECISION&
               &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
339 340 341 342
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
343
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
344 345
               &PRECISION&
               &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
346 347 348 349 350 351 352 353 354
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

          work_size = dwork_size(1)
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
355
            stop 1
356
          endif
357
          work_blocked = CONST_0_0
358 359 360
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
361
            stop 1
362 363 364 365 366
          endif

        endif ! which_qr_decomposition

      endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
367
#endif /* REALCASE */
368 369 370 371 372

      if (useGPU) then

        cur_l_rows = 0
        cur_l_cols = 0
Andreas Marek's avatar
Andreas Marek committed
373

374
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
375
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
376
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
377 378
                  &MATH_DATATYPE&
                  &: error in cudaMemcpy a_dev 2"
Andreas Marek's avatar
Andreas Marek committed
379
          stop 1
380
        endif
381 382 383 384 385 386 387 388 389 390 391
      endif ! useGPU


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

392 393
        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

394 395 396 397 398 399 400 401 402 403 404 405 406
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
407
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
408 409
                        &MATH_DATATYPE&
                        &: error when deallocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
410
                stop 1
411 412 413 414
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
415
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
416 417
                      &MATH_DATATYPE&
                      &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
418
              stop 1
419 420 421 422 423 424 425 426
            endif

          endif

          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
427
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
428 429
                        &MATH_DATATYPE&
                        &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
430
                stop 1
431 432 433 434
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
435
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
436
                        &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
Andreas Marek's avatar
Andreas Marek committed
437
                stop 1
438 439 440 441
              endif
            endif

            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
442

443
            if (istat .ne. 0) then
444
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
445 446
                      &MATH_DATATYPE&
                      &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
447
              stop 1
448
            endif
449
            successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
450
            if (.not.(successCUDA)) then
451
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
452 453
                      &MATH_DATATYPE&
                      &: error in cudaMalloc: vmr_dev2"
Andreas Marek's avatar
Andreas Marek committed
454
              stop 1
455 456 457
            endif

          endif
458 459 460 461 462

          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
463
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
464 465
                        &MATH_DATATYPE&
                        &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
466
                stop 1
467 468 469 470
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
471
                 print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
472 473
                         &MATH_DATATYPE&
                         &: error in cudaFree umc_dev 1"
Andreas Marek's avatar
Andreas Marek committed
474
                 stop 1
475 476 477
              endif

            endif
478

479
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
480

481
            if (istat .ne. 0) then
482
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
483 484
                      &MATH_DATATYPE&
                      &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
485
              stop 1
486 487
            endif

488
            successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
489
            if (.not.(successCUDA)) then
490
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
491 492
                      &MATH_DATATYPE&
                      &: error in cudaMalloc umc_dev 2"
Andreas Marek's avatar
Andreas Marek committed
493
              stop 1
494
            endif
495

496
          endif
497 498 499

        else ! GPU not used

500 501 502
          ! unify the the name vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
503 504 505

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
506
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
507 508
                     &MATH_DATATYPE&
                     &: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
509
            stop 1
510 511
          endif

512 513
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
514
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
515 516
                    &MATH_DATATYPE&
                    &: error when allocating umcCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
517
            stop 1
518
          endif
Andreas Marek's avatar
Andreas Marek committed
519

520 521
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
522
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
523 524
                    &MATH_DATATYPE&
                    &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
525
            stop 1
526 527 528 529 530
          endif

        endif ! use GPU

        if (useGPU) then
531
#if REALCASE == 1
532
          vmrCUDA(1 : cur_l_rows * n_cols) = CONST_0_0
533 534
#endif
#if COMPLEXCASE == 1
535
          vmrCUDA(1: cur_l_rows * n_cols) = CONST_COMPLEX_0_0
536
#endif
537
        else
538
#if REALCASE == 1
539
          vmrCPU(1:l_rows,1:n_cols) = CONST_0_0
540 541 542 543 544 545 546
#endif
#if COMPLEXCASE == 1
          vmrCPU(1:l_rows,1:n_cols) = CONST_COMPLEX_0_0
#endif
        endif ! useGPU

#if REALCASE == 1
547 548
        vr(:) = CONST_0_0
        tmat(:,:,istep) = CONST_0_0
549 550 551 552 553
#endif
#if COMPLEXCASE == 1
        vr(:) = CONST_COMPLEX_0_0
        tmat(:,:,istep) = CONST_COMPLEX_0_0
#endif
554
        if (useGPU) then
555
#if REALCASE == 1
556
          umcCUDA(1 : umc_size) = CONST_0_0
557
#endif
558 559 560 561
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

562
          if (lc_start .le. 0) lc_start = 1
563 564 565 566

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

567 568
          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
Andreas Marek's avatar
Retab  
Andreas Marek committed
569
                                      int((lda*size_of_datatype),kind=c_intptr_t), &
570 571 572 573
                                            (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
                                            int(lda*size_of_datatype,kind=c_intptr_t),              &
                                            int(lr_end*size_of_datatype,kind=c_intptr_t),           &
                                             int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
574

575

576
            if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
577
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
578 579
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy2d"
Andreas Marek's avatar
Andreas Marek committed
580
              stop 1
581 582 583 584 585
            endif
          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
586
#if REALCASE == 1
587 588 589 590
        if (useQR) then
          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
591
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
592 593
                 &PRECISION&
                 &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
594 595 596 597 598 599 600
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
601
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
602 603
                 &PRECISION&
                 &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
604 605 606 607 608 609 610 611 612 613
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
614
#endif /* REALCASE == 1 */
615 616
         do lc = n_cols, 1, -1

617
           ncol = istep*nbw + lc ! absolute column number of householder Vector
618 619 620 621 622 623 624 625 626 627 628 629 630
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

631
             ! Get Vector to be transformed; distribute last element and norm of
632 633
             ! remaining elements to all procs in current column

634
             vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
635 636 637 638 639 640

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
641
#if REALCASE == 1
642
               aux1(2) = CONST_0_0
643 644 645 646
#endif
#if COMPLEXCASE == 1
               aux1(2) = CONST_COMPLEX_0_0
#endif
647 648 649
             endif

#ifdef WITH_MPI
650
             if (wantDebug) call obj%timer%start("mpi_communication")
Andreas Marek's avatar
Andreas Marek committed
651
             call mpi_allreduce(aux1, aux2, 2, &
652
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
653
                                MPI_REAL_PRECISION, &
654 655
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
656
                                MPI_COMPLEX_PRECISION, &
657
#endif
Andreas Marek's avatar
Andreas Marek committed
658
                                MPI_SUM, mpi_comm_rows, mpierr)
659
             if (wantDebug) call obj%timer%stop("mpi_communication")
660 661 662

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
663
#endif
664 665 666 667 668

             vnorm2 = aux2(1)
             vrl    = aux2(2)

             ! Householder transformation
669
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
670
       call hh_transform_real_&
671 672
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
673
       call hh_transform_complex_&
674
#endif
Andreas Marek's avatar
Andreas Marek committed
675
             &PRECISION &
676
                              (obj, vrl, vnorm2, xf, tau, wantDebug)
677
             ! Scale vr and store Householder Vector for back transformation
678 679 680 681 682

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
683
#if REALCASE == 1
684
               vr(lr) = CONST_1_0
685 686 687 688
#endif
#if COMPLEXCASE == 1
               vr(lr) = CONST_COMPLEX_1_0
#endif
689 690 691 692 693 694
             else
               a(1:lr,lch) = vr(1:lr)
             endif

           endif

695
           ! Broadcast Householder Vector and tau along columns
696 697 698

           vr(lr+1) = tau
#ifdef WITH_MPI
699
           if (wantDebug) call obj%timer%start("mpi_communication")
700
           call MPI_Bcast(vr, lr+1, &
701
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
702
                          MPI_REAL_PRECISION, &
703 704
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
705
                          MPI_COMPLEX_PRECISION, &
706
#endif
Andreas Marek's avatar
Andreas Marek committed
707
                          cur_pcol, mpi_comm_cols, mpierr)
708
           if (wantDebug) call obj%timer%stop("mpi_communication")
709 710

#endif /* WITH_MPI */
711

712 713
           if (useGPU) then
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
714 715 716
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
717 718
           tau = vr(lr+1)

719 720 721 722 723 724
#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
725
           ! Transform remaining columns in current block with Householder Vector
726 727
           ! Local dot product

728
#if REALCASE == 1
729
           aux1 = 0
730 731 732 733
#endif
#if COMPLEXCASE == 1
          aux1 = CONST_COMPLEX_0_0
#endif
734 735

#ifdef WITH_OPENMP
736 737 738
#if 0
 ! original complex implementation without openmp. check performance
            nlc = 0 ! number of local columns
739 740 741 742 743 744 745 746 747 748
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
749
           if (wantDebug) call obj%timer%start("mpi_communication")
750 751 752 753 754 755 756 757 758 759
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
760

761 762 763
             endif
           enddo

764

765
           if (wantDebug) call obj%timer%stop("mpi_communication")
766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
!               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
791

792 793
!             endif
!           enddo
794
#endif /* if 0 */
795

796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
823
           if (wantDebug) call obj%timer%start("mpi_communication")
824 825
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc,     &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
826
                                     MPI_REAL_PRECISION,    &
827 828 829 830
#endif
#if COMPLEXCASE == 1
                                           MPI_COMPLEX_PRECISION, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
831
                         MPI_SUM, mpi_comm_rows, mpierr)
832
           if (wantDebug) call obj%timer%stop("mpi_communication")
833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
#if REALCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
862 863 864 865 866 867 868 869 870 871 872 873 874 875

#else /* WITH_OPENMP */

           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
876
           if (wantDebug) call obj%timer%start("mpi_communication")
877 878
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc,      &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
879
                                   MPI_REAL_PRECISION,   &
880 881 882
#endif
#if COMPLEXCASE == 1
                                         MPI_COMPLEX_PRECISION,&
883
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
884
           MPI_SUM, mpi_comm_rows, mpierr)
885
           if (wantDebug) call obj%timer%stop("mpi_communication")
886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
#if REALCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
#endif
#if COMPLEXCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
#endif
             endif
           enddo
#endif /* WITH_OPENMP */
905 906 907 908 909 910
         enddo ! lc

         if (useGPU) then
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
911
             successCUDA = cuda_memcpy2d((a_dev+        &
912 913 914 915 916
                                         int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
                                         int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
                                         int(lda*size_of_datatype,kind=c_intptr_t),           &
                                         int(lr_end*size_of_datatype,kind=c_intptr_t),        &
                                         int((lc_end - lc_start+1),kind=c_intptr_t), &
917
                                         int(cudaMemcpyHostToDevice,kind=c_int))