#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
    subroutine bandred_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
    (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
     tmat_dev, wantDebug, useGPU, success, &
#if REALCASE == 1
     useQR, &
#endif
     max_threads)

  !-------------------------------------------------------------------------------
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
!              Distribution is like in ScaLAPACK.
!              As opposed to ScaLAPACK, a(:,:) must be set completely (upper and lower half);
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
!  nbw         semi bandwidth of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
      use elpa_abstract_impl
      implicit none
#include "../general/precision_kinds.F90"
      class(elpa_abstract_impl_t), intent(inout) :: obj
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#ifdef USE_ASSUMED_SIZE
      MATH_DATATYPE(kind=rck)                    :: a(lda,*), tmat(nbw,nbw,*)
#else
      MATH_DATATYPE(kind=rck)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
#endif

#if REALCASE == 1
      real(kind=rk)                               :: eps
#endif
      logical, intent(in)                         :: useGPU

      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
      integer(kind=ik)                            :: vmrCols
#endif
#ifdef WITH_OPENMP
      integer(kind=ik)                            :: mynlc, lrs, transformChunkSize
#endif
      integer(kind=ik)                            :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile

      real(kind=rk)                    :: vnorm2
      MATH_DATATYPE(kind=rck)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)

!      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
      MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      MATH_DATATYPE(kind=rck), allocatable :: vr(:)

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
      real(kind=rk)                    :: dwork_size(1)
      real(kind=rk), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
#endif
      ! a_dev is passed from bandred_real to trans_ev_band
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
#ifdef WITH_MPI
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
      integer(kind=c_intptr_t)                    :: lc_start, lc_end
#if COMPLEXCASE == 1
      integer(kind=c_intptr_t)                    :: lce_1, lcs_1, lre_1
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
#endif

      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
      integer(kind=ik)                            :: min_tile_size, error

#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
                                                    ii, pp
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE

      logical                                     :: useGPU_reduction_lower_block_to_tridiagonal
      integer(kind=ik), intent(in)                :: max_threads


      call obj%timer%start("bandred_&
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )
      useGPU_reduction_lower_block_to_tridiagonal = .false.

      if (useGPU) then
        useGPU_reduction_lower_block_to_tridiagonal = .true.
#if REALCASE == 1
        if (useQR) then
          !in this case switch off GPU usage for step "reduce current block to lower triangular form"
          ! since this is done by QR decomposition
          useGPU_reduction_lower_block_to_tridiagonal = .false.
        endif
#endif
      endif

      if (wantDebug) call obj%timer%start("mpi_communication")

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

      if (wantDebug) call obj%timer%stop("mpi_communication")
      success = .true.


      ! Semibandwidth nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
            write(error_unit,*) 'ELPA2_bandred_&
                                 &MATH_DATATYPE&
                                 &: ERROR: nbw=',nbw,', nblk=',nblk
            write(error_unit,*) 'ELPA2_bandred_&
                                 &MATH_DATATYPE&
                                 &: ELPA2 works only for nbw==n*nblk'
          endif
          success = .false.
          return
        endif
      endif

      ! na_rows is used nowhere; only na_cols
      if (useGPU) then
#ifdef WITH_MPI
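        ! numroc (ScaLAPACK utility) gives the number of rows/columns of the
        ! block-cyclically distributed matrix that are stored locally on this process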
#if COMPLEXCASE == 1
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
#endif
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
#if COMPLEXCASE == 1
         na_rows = na
#endif
        na_cols = na
#endif /* WITH_MPI */

        ! Allocate device memory for the local matrix (a_dev) and for the nbw x nbw work matrices (tmat_dev, vav_dev)
        successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
        if (.not.(successCUDA)) then
          print *,"bandred_&
                  &MATH_DATATYPE&
                  &: error in cudaMalloc a_dev 1"
          stop 1
        endif

        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
        if (.not.(successCUDA)) then
          print *,"bandred_&
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
          stop 1
        endif

        successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
        if (.not.(successCUDA)) then
          print *,"bandred_&
                  &MATH_DATATYPE&
                  &: error in cudaMalloc vav_dev 1"
          stop 1
        endif
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size

      ! make tile_size the smallest possible multiple of the previously defined tile size such that it is
      ! larger than or equal to min_tile_size
      ! min_tile_size was originally hardcoded as 128 * max(np_rows, np_cols), so this is now the implicit default;
      ! it can, however, be set by the user
      call obj%get("min_tile_size", min_tile_size ,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem setting option. Aborting..."
        stop
      endif
      if(min_tile_size == 0) then
        ! not set by the user, use the default value
        min_tile_size = 128*max(np_rows, np_cols)
      endif
      tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
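      ! example: with nblk = 64 and np_rows = np_cols = 4 the base tile size is 64*4 = 256;
      ! the default min_tile_size is 128*4 = 512, so tile_size is rounded up to 512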

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

#if REALCASE == 1
      if (useQR) then

        if (which_qr_decomposition == 1) then
          call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
            stop 1
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
            stop 1
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
            stop 1
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
          call qr_pdgeqrf_2dcomm_&
               &PRECISION&
               &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
          call qr_pdgeqrf_2dcomm_&
               &PRECISION&
               &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

          work_size = int(dwork_size(1))
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
            stop 1
          endif
          work_blocked = 0.0_rk
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
            stop 1
          endif

        endif ! which_qr_decomposition

      endif ! useQr
#endif /* REALCASE */

      if (useGPU) then

        cur_l_rows = 0
        cur_l_cols = 0

        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
        if (.not.(successCUDA)) then
          print *,"bandred_&
                  &MATH_DATATYPE&
                  &: error in cudaMemcpy a_dev 2"
          stop 1
        endif
      endif ! useGPU
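      ! Main reduction loop: the matrix is reduced panel by panel, starting with the last
      ! block column (istep = (na-1)/nbw) and moving towards the first one; in each step
      ! one block column of up to nbw columns is brought to band form by Householder transformations.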


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the new size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
                print *,"bandred_&
                        &MATH_DATATYPE&
                        &: error when deallocating vr "//errorMessage
                stop 1
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_&
                      &MATH_DATATYPE&
                      &: error when allocating vr "//errorMessage
              stop 1
            endif

          endif

          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
                print *,"bandred_&
                        &MATH_DATATYPE&
                        &: error when deallocating vmrCUDA "//errorMessage
                stop 1
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
                print *,"bandred_&
                        &MATH_DATATYPE&
                        &: error in cuda_free vmr_dev 1"
                stop 1
              endif
            endif

            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)

            if (istat .ne. 0) then
              print *,"bandred_&
                      &MATH_DATATYPE&
                      &: error when allocating vmrCUDA "//errorMessage
              stop 1
            endif
            successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
            if (.not.(successCUDA)) then
              print *,"bandred_&
                      &MATH_DATATYPE&
                      &: error in cudaMalloc: vmr_dev2"
              stop 1
            endif

          endif

          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
                print *,"bandred_&
                        &MATH_DATATYPE&
                        &: error when deallocating umcCUDA "//errorMessage
                stop 1
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
                 print *,"bandred_&
                         &MATH_DATATYPE&
                         &: error in cudaFree umc_dev 1"
                 stop 1
              endif

            endif

            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)

            if (istat .ne. 0) then
              print *,"bandred_&
                      &MATH_DATATYPE&
                      &: error when allocating umcCUDA "//errorMessage
              stop 1
            endif

            successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
            if (.not.(successCUDA)) then
              print *,"bandred_&
                      &MATH_DATATYPE&
                      &: error in cudaMalloc umc_dev 2"
              stop 1
            endif

          endif

        else ! GPU not used

          ! unify the names vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_&
                     &MATH_DATATYPE&
                     &: error when allocating vmrCPU "//errorMessage
            stop 1
          endif

          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_&
                    &MATH_DATATYPE&
                    &: error when allocating umcCPU "//errorMessage
            stop 1
          endif

          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_&
                    &MATH_DATATYPE&
                    &: error when allocating vr "//errorMessage
            stop 1
          endif

        endif ! use GPU

        if (useGPU) then
          vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck
        else
          vmrCPU(1:l_rows,1:n_cols) = 0.0_rck
        endif ! useGPU

        vr(:) = 0.0_rck
        tmat(:,:,istep) = 0.0_rck
        if (useGPU) then
#if REALCASE == 1
          umcCUDA(1 : umc_size) = 0.0_rck
#endif
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

          if (lc_start .le. 0) lc_start = 1

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
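          ! Copy the current panel (columns lc_start..lc_end, first lr_end rows) from the
          ! device back to the host, so that the reduction below can work on the host array a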

          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
                                      int((lda*size_of_datatype),kind=c_intptr_t), &
                                            (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
                                            int(lda*size_of_datatype,kind=c_intptr_t),              &
                                            int(lr_end*size_of_datatype,kind=c_intptr_t),           &
                                             int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))


            if (.not.(successCUDA)) then
              print *,"bandred_&
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy2d"
              stop 1
            endif
          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
#if REALCASE == 1
        if (useQR) then
          if (useGPU) then
 !            vmrCPU(1:cur_l_rows,1:n_cols) = vmrCUDA(1 : cur_l_rows * n_cols)
          endif

          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
            call qr_pdgeqrf_2dcomm_&
                 &PRECISION&
                 &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
            call qr_pdgeqrf_2dcomm_&
                 &PRECISION&
                 &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
#endif /* REALCASE == 1 */
         do lc = n_cols, 1, -1
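           ! Loop over the n_cols columns of the current block, from the last one to the first;
           ! each pass generates one Householder vector and applies it to the remaining columns of the block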

           ncol = istep*nbw + lc ! absolute column number of householder Vector
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

             ! Get Vector to be transformed; distribute last element and norm of
             ! remaining elements to all procs in current column

             vr(1:lr) = a(1:lr,lch) ! Vector to be transformed

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
               aux1(2) = 0.0_rck
             endif

#ifdef WITH_MPI
             if (wantDebug) call obj%timer%start("mpi_communication")
             call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
                                MPI_SUM, mpi_comm_rows, mpierr)
             if (wantDebug) call obj%timer%stop("mpi_communication")

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
#endif

#if REALCASE == 1
             vnorm2 = aux2(1)
#endif
#if COMPLEXCASE == 1
             vnorm2 = real(aux2(1),kind=rk)
#endif
             vrl    = aux2(2)

             ! Householder transformation
             call hh_transform_&
             &MATH_DATATYPE&
             &_&
             &PRECISION &
                         (obj, vrl, vnorm2, xf, tau, wantDebug)
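             ! hh_transform determines the Householder factor tau and the scaling factor xf
             ! from the pivot element vrl and the squared norm vnorm2 of the remaining entries;
             ! on exit vrl holds the transformed matrix element that is written back to a(lr,lch) below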
             ! Scale vr and store Householder Vector for back transformation

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
               vr(lr) = 1.0_rck
             else
               a(1:lr,lch) = vr(1:lr)
             endif

           endif

           ! Broadcast Householder Vector and tau along columns

           vr(lr+1) = tau
#ifdef WITH_MPI
           if (wantDebug) call obj%timer%start("mpi_communication")
           call MPI_Bcast(vr, lr+1, MPI_MATH_DATATYPE_PRECISION, &
                          cur_pcol, mpi_comm_cols, mpierr)
           if (wantDebug) call obj%timer%stop("mpi_communication")

#endif /* WITH_MPI */

           if (useGPU_reduction_lower_block_to_tridiagonal) then
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
           tau = vr(lr+1)

#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
           ! Transform remaining columns in current block with Householder Vector
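           ! The update applied below is a rank-1 update of each remaining column:
           !   a(:,lcx) <- a(:,lcx) - tau * vr * (vr^H * a(:,lcx))   (with conjg(tau) in the complex case);
           ! aux1/aux2 hold the local/global dot products vr^H * a(:,lcx)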
           ! Local dot product

           aux1 = 0.0_rck

#ifdef WITH_OPENMP
#if 0
 ! original complex implementation without openmp. check performance
            nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
           if (wantDebug) call obj%timer%start("mpi_communication")
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)

             endif
           enddo


           if (wantDebug) call obj%timer%stop("mpi_communication")

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
!               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)

!             endif
!           enddo
#endif /* if 0 */

           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
           if (wantDebug) call obj%timer%start("mpi_communication")
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_MATH_DATATYPE_PRECISION, &
                                           MPI_SUM, mpi_comm_rows, mpierr)
           if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
#if REALCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel

#else /* WITH_OPENMP */

           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
           if (wantDebug) call obj%timer%start("mpi_communication")
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_MATH_DATATYPE_PRECISION, &
                                         MPI_SUM, mpi_comm_rows, mpierr)
           if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
#if REALCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
#endif
#if COMPLEXCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
#endif
             endif
           enddo
#endif /* WITH_OPENMP */
         enddo ! lc

         if (useGPU_reduction_lower_block_to_tridiagonal) then
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
             successCUDA = cuda_memcpy2d((a_dev+        &
                                         int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
                                         int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
                                         int(lda*size_of_datatype,kind=c_intptr_t),           &
                                         int(lr_end*size_of_datatype,kind=c_intptr_t),        &
                                         int((lc_end - lc_start+1),kind=c_intptr_t), &
                                         int(cudaMemcpyHostToDevice,kind=c_int))

             if (.not.(successCUDA)) then
               print *, "bandred_&
                        &MATH_DATATYPE&
                        &: cuda memcpy a_dev  failed ", istat
               stop 1
             endif
           endif
         endif

         ! Calculate scalar products of stored Householder vectors.
         ! This can be done in different ways, we use dsyrk
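         ! The result vav holds the upper triangle of V^T V (V^H V in the complex case),
         ! where V are the n_cols Householder vectors of this block stored in vmr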

         vav = 0
         call obj%timer%start("blas")
         if (useGPU_reduction_lower_block_to_tridiagonal) then
           if (l_rows>0) &
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',            &
#endif
#if COMPLEXCASE == 1
             call PRECISION_HERK('U', 'C',            &
#endif
                           n_cols, l_rows, ONE, &
                           vmrCUDA, cur_l_rows, &
                           ZERO, vav, ubound(vav,dim=1))

         else ! useGPU_reduction_lower_block_to_tridiagonal
           if (l_rows>0) &
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',           &
#endif
#if COMPLEXCASE == 1
             call PRECISION_HERK('U', 'C',           &
#endif
                           n_cols, l_rows, ONE, vmrCPU, ubound(vmrCPU,dim=1), ZERO, vav, ubound(vav,dim=1))
         endif
         call obj%timer%stop("blas")
#if REALCASE == 1
         call symm_matrix_allreduce_&
#endif
#if COMPLEXCASE == 1
         call herm_matrix_allreduce_&