elpa2_bandred_template.F90 61.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
63 64 65 66
    subroutine bandred_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
67
    (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
Andreas Marek's avatar
Andreas Marek committed
68
     tmat_dev, wantDebug, useGPU, success &
69
#if REALCASE == 1
70
     , useQR)
71 72
#endif
#if COMPLEXCASE == 1
73
     )
74
#endif
75

76
  !-------------------------------------------------------------------------------
77
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
112
      use elpa_abstract_impl
113
      implicit none
114
#include "../general/precision_kinds.F90"
115
      class(elpa_abstract_impl_t), intent(inout) :: obj
116 117 118
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#ifdef USE_ASSUMED_SIZE
119
      MATH_DATATYPE(kind=rck)                    :: a(lda,*), tmat(nbw,nbw,*)
120
#else
121
      MATH_DATATYPE(kind=rck)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
122
#endif
Andreas Marek's avatar
Andreas Marek committed
123 124

#if REALCASE == 1
125
      real(kind=rk)                               :: eps
126 127
#endif
      logical, intent(in)                         :: useGPU
128

129 130 131
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
132
      integer(kind=ik)                            :: vmrCols
133
#endif
Andreas Marek's avatar
Andreas Marek committed
134 135 136 137
#ifdef WITH_OPENMP
      integer(kind=ik)                            :: mynlc, lrs, transformChunkSize
#endif
      integer(kind=ik)                            :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
138 139
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
140

141 142
      real(kind=rk)                    :: vnorm2
      MATH_DATATYPE(kind=rck)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
143

144
!      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
145 146 147
      MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      MATH_DATATYPE(kind=rck), allocatable :: vr(:)
148 149 150 151

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
152 153
      real(kind=rk)                    :: dwork_size(1)
      real(kind=rk), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
154
#endif
155
      ! a_dev is passed from bandred_real to trans_ev_band
156
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
157
#ifdef WITH_MPI
158 159 160 161
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
Andreas Marek's avatar
Andreas Marek committed
162
      integer(kind=c_intptr_t)                    :: lc_start, lc_end
163
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
164
      integer(kind=c_intptr_t)                    :: lce_1, lcs_1, lre_1
165 166 167 168 169
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
170 171
#endif

172 173 174 175 176
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
177

178 179 180 181
#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
Andreas Marek's avatar
Andreas Marek committed
182
                                                    ii, pp, transformChunkSize
183
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
184 185 186
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
187

Andreas Marek's avatar
Andreas Marek committed
188 189 190
      logical                                     :: useGPU_reduction_lower_block_to_tridiagonal


191
      call obj%timer%start("bandred_&
Andreas Marek's avatar
Andreas Marek committed
192 193 194 195
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )
Andreas Marek's avatar
Andreas Marek committed
196 197 198 199 200 201 202 203 204 205 206 207

      if (useGPU) then
        useGPU_reduction_lower_block_to_tridiagonal = .true.
#if REALCASE == 1
        if (useQR) then
          !in this case switch off GPU usage for step "reduce current block to lower triangular form"
          ! since this is done by QR decomposition
          useGPU_reduction_lower_block_to_tridiagonal = .false.
        endif
#endif
      endif

208
      if (wantDebug) call obj%timer%start("mpi_communication")
209 210 211 212 213

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
Andreas Marek's avatar
Andreas Marek committed
214

215
      if (wantDebug) call obj%timer%stop("mpi_communication")
216 217 218 219 220 221 222
      success = .true.


      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
Andreas Marek's avatar
Andreas Marek committed
223
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Andreas Marek committed
224 225
                                 &MATH_DATATYPE&
                                 &: ERROR: nbw=',nbw,', nblk=',nblk
Andreas Marek's avatar
Andreas Marek committed
226
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Andreas Marek committed
227 228
                                 &MATH_DATATYPE&
                                 &: ELPA2 works only for nbw==n*nblk'
229 230 231 232 233 234
          endif
          success = .false.
          return
        endif
      endif

Andreas Marek's avatar
Andreas Marek committed
235
      ! na_rows in used nowhere; only na_cols
236 237
      if (useGPU) then
#ifdef WITH_MPI
238
#if COMPLEXCASE == 1
239
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
240
#endif
241 242
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
243 244 245
#if COMPLEXCASE == 1
         na_rows = na
#endif
246
        na_cols = na
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
247
#endif /* WITH_MPI */
248 249

        ! Here we convert the regular host array into a pinned host array
250
        successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
251
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
252
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
253 254
                  &MATH_DATATYPE&
                  &: error in cudaMalloc a_dev 1"
Andreas Marek's avatar
Andreas Marek committed
255
          stop 1
256 257
        endif

258
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
259
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
260
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
261 262
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
Andreas Marek's avatar
Andreas Marek committed
263
          stop 1
264 265
        endif

266
        successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
267
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
268
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
269 270
                  &MATH_DATATYPE&
                  &: error in cudaMalloc vav_dev 1"
Andreas Marek's avatar
Andreas Marek committed
271
          stop 1
272
        endif
273 274 275 276 277 278 279 280 281 282
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

283
#if REALCASE == 1
284 285 286
      if (useQR) then

        if (which_qr_decomposition == 1) then
287
          call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
288 289 290
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
291
            stop 1
292 293 294 295 296
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
297
            stop 1
298 299 300 301 302 303
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
304
            stop 1
305 306 307 308 309
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
310
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
311 312
               &PRECISION&
               &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
313 314 315 316
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
317
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
318 319
               &PRECISION&
               &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
320 321 322 323 324
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

325
          work_size = int(dwork_size(1))
326 327 328
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
329
            stop 1
330
          endif
Pavel Kus's avatar
Pavel Kus committed
331
          work_blocked = 0.0_rk
332 333 334
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
335
            stop 1
336 337 338 339 340
          endif

        endif ! which_qr_decomposition

      endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
341
#endif /* REALCASE */
342 343 344 345 346

      if (useGPU) then

        cur_l_rows = 0
        cur_l_cols = 0
Andreas Marek's avatar
Andreas Marek committed
347

348
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
349
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
350
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
351 352
                  &MATH_DATATYPE&
                  &: error in cudaMemcpy a_dev 2"
Andreas Marek's avatar
Andreas Marek committed
353
          stop 1
354
        endif
355 356 357 358 359 360 361 362 363 364 365
      endif ! useGPU


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

366 367
        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

368 369 370 371 372 373 374 375 376 377 378 379 380
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
381
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
382 383
                        &MATH_DATATYPE&
                        &: error when deallocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
384
                stop 1
385 386 387 388
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
389
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
390 391
                      &MATH_DATATYPE&
                      &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
392
              stop 1
393 394 395 396 397 398 399 400
            endif

          endif

          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
401
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
402 403
                        &MATH_DATATYPE&
                        &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
404
                stop 1
405 406 407 408
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
409
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
410
                        &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
Andreas Marek's avatar
Andreas Marek committed
411
                stop 1
412 413 414 415
              endif
            endif

            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
416

417
            if (istat .ne. 0) then
418
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
419 420
                      &MATH_DATATYPE&
                      &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
421
              stop 1
422
            endif
423
            successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
424
            if (.not.(successCUDA)) then
425
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
426 427
                      &MATH_DATATYPE&
                      &: error in cudaMalloc: vmr_dev2"
Andreas Marek's avatar
Andreas Marek committed
428
              stop 1
429 430 431
            endif

          endif
432 433 434 435 436

          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
437
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
438 439
                        &MATH_DATATYPE&
                        &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
440
                stop 1
441 442 443 444
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
445
                 print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
446 447
                         &MATH_DATATYPE&
                         &: error in cudaFree umc_dev 1"
Andreas Marek's avatar
Andreas Marek committed
448
                 stop 1
449 450 451
              endif

            endif
452

453
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
454

455
            if (istat .ne. 0) then
456
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
457 458
                      &MATH_DATATYPE&
                      &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
459
              stop 1
460 461
            endif

462
            successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
463
            if (.not.(successCUDA)) then
464
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
465 466
                      &MATH_DATATYPE&
                      &: error in cudaMalloc umc_dev 2"
Andreas Marek's avatar
Andreas Marek committed
467
              stop 1
468
            endif
469

470
          endif
471 472 473

        else ! GPU not used

474 475 476
          ! unify the the name vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
477 478 479

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
480
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
481 482
                     &MATH_DATATYPE&
                     &: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
483
            stop 1
484 485
          endif

486 487
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
488
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
489 490
                    &MATH_DATATYPE&
                    &: error when allocating umcCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
491
            stop 1
492
          endif
Andreas Marek's avatar
Andreas Marek committed
493

494 495
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
496
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
497 498
                    &MATH_DATATYPE&
                    &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
499
            stop 1
500 501 502 503 504
          endif

        endif ! use GPU

        if (useGPU) then
Pavel Kus's avatar
Pavel Kus committed
505
          vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck
506
        else
Pavel Kus's avatar
Pavel Kus committed
507
          vmrCPU(1:l_rows,1:n_cols) = 0.0_rck
508 509
        endif ! useGPU

Pavel Kus's avatar
Pavel Kus committed
510 511
        vr(:) = 0.0_rck
        tmat(:,:,istep) = 0.0_rck
512
        if (useGPU) then
513
#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
514
          umcCUDA(1 : umc_size) = 0.0_rck
515
#endif
516 517 518 519
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

520
          if (lc_start .le. 0) lc_start = 1
521 522 523 524

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

525 526
          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
Andreas Marek's avatar
Retab  
Andreas Marek committed
527
                                      int((lda*size_of_datatype),kind=c_intptr_t), &
528 529 530 531
                                            (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
                                            int(lda*size_of_datatype,kind=c_intptr_t),              &
                                            int(lr_end*size_of_datatype,kind=c_intptr_t),           &
                                             int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
532

533

534
            if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
535
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
536 537
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy2d"
Andreas Marek's avatar
Andreas Marek committed
538
              stop 1
539 540 541 542 543
            endif
          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
544
#if REALCASE == 1
545
        if (useQR) then
Andreas Marek's avatar
Andreas Marek committed
546 547 548 549
          if (useGPU) then
 !            vmrCPU(1:cur_l_rows,1:n_cols) = vmrCUDA(1 : cur_l_rows * n_cols)
          endif

550 551 552
          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
553
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
554 555
                 &PRECISION&
                 &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
556 557 558 559 560 561 562
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
563
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
564 565
                 &PRECISION&
                 &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
566 567 568 569 570 571 572 573 574 575
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
576
#endif /* REALCASE == 1 */
577 578
         do lc = n_cols, 1, -1

579
           ncol = istep*nbw + lc ! absolute column number of householder Vector
580 581 582 583 584 585 586 587 588 589 590 591 592
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

593
             ! Get Vector to be transformed; distribute last element and norm of
594 595
             ! remaining elements to all procs in current column

596
             vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
597 598 599 600 601 602

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
Pavel Kus's avatar
Pavel Kus committed
603
               aux1(2) = 0.0_rck
604 605 606
             endif

#ifdef WITH_MPI
607
             if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
608
             call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
Andreas Marek's avatar
Andreas Marek committed
609
                                MPI_SUM, mpi_comm_rows, mpierr)
610
             if (wantDebug) call obj%timer%stop("mpi_communication")
611 612 613

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
614
#endif
615

616
#if REALCASE == 1
617
             vnorm2 = aux2(1)
618 619 620 621
#endif
#if COMPLEXCASE == 1
             vnorm2 = real(aux2(1),kind=rk)
#endif
622 623 624
             vrl    = aux2(2)

             ! Householder transformation
Pavel Kus's avatar
Pavel Kus committed
625 626 627
       call hh_transform_&
             &MATH_DATATYPE&
             &_&
Andreas Marek's avatar
Andreas Marek committed
628
             &PRECISION &
Pavel Kus's avatar
Pavel Kus committed
629
                         (obj, vrl, vnorm2, xf, tau, wantDebug)
630
             ! Scale vr and store Householder Vector for back transformation
631 632 633 634 635

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
Pavel Kus's avatar
Pavel Kus committed
636
               vr(lr) = 1.0_rck
637 638 639 640 641 642
             else
               a(1:lr,lch) = vr(1:lr)
             endif

           endif

643
           ! Broadcast Householder Vector and tau along columns
644 645 646

           vr(lr+1) = tau
#ifdef WITH_MPI
647
           if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
648
           call MPI_Bcast(vr, lr+1, MPI_MATH_DATATYPE_PRECISION, &
Andreas Marek's avatar
Andreas Marek committed
649
                          cur_pcol, mpi_comm_cols, mpierr)
650
           if (wantDebug) call obj%timer%stop("mpi_communication")
651 652

#endif /* WITH_MPI */
653

Andreas Marek's avatar
Andreas Marek committed
654
           if (useGPU_reduction_lower_block_to_tridiagonal) then
655
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
656 657 658
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
659 660
           tau = vr(lr+1)

661 662 663 664 665 666
#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
667
           ! Transform remaining columns in current block with Householder Vector
668 669
           ! Local dot product

Pavel Kus's avatar
Pavel Kus committed
670
           aux1 = 0.0_rck
671 672

#ifdef WITH_OPENMP
673 674 675
#if 0
 ! original complex implementation without openmp. check performance
            nlc = 0 ! number of local columns
676 677 678 679 680 681 682 683 684 685
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
686
           if (wantDebug) call obj%timer%start("mpi_communication")
687 688 689 690 691 692 693 694 695 696
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
697

698 699 700
             endif
           enddo

701

702
           if (wantDebug) call obj%timer%stop("mpi_communication")
703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
!               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
728

729 730
!             endif
!           enddo
731
#endif /* if 0 */
732

733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
760
           if (wantDebug) call obj%timer%start("mpi_communication")
761
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_MATH_DATATYPE_PRECISION, &
Pavel Kus's avatar
Pavel Kus committed
762
                                           MPI_SUM, mpi_comm_rows, mpierr)
763
           if (wantDebug) call obj%timer%stop("mpi_communication")
764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
#if REALCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
793 794 795 796 797 798 799 800 801 802 803 804 805 806

#else /* WITH_OPENMP */

           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
807
           if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
808 809
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_MATH_DATATYPE_PRECISION, &
                                         MPI_SUM, mpi_comm_rows, mpierr)
810
           if (wantDebug) call obj%timer%stop("mpi_communication")
811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
#if REALCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
#endif
#if COMPLEXCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
#endif
             endif
           enddo
#endif /* WITH_OPENMP */
830 831
         enddo ! lc

Andreas Marek's avatar
Andreas Marek committed
832
         if (useGPU_reduction_lower_block_to_tridiagonal) then
833 834 835
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
836
             successCUDA = cuda_memcpy2d((a_dev+        &
837 838 839 840 841
                                         int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
                                         int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
                                         int(lda*size_of_datatype,kind=c_intptr_t),           &
                                         int(lr_end*size_of_datatype,kind=c_intptr_t),        &
                                         int((lc_end - lc_start+1),kind=c_intptr_t), &
842
                                         int(cudaMemcpyHostToDevice,kind=c_int))
843

844
             if (.not.(successCUDA)) then
845
               print *, "bandred_&
846 847
                        &MATH_DATATYPE&
                        &: cuda memcpy a_dev  failed ", istat
Andreas Marek's avatar
Andreas Marek committed
848
               stop 1
849
             endif
850 851 852 853 854 855 856
           endif
         endif

         ! Calculate scalar products of stored Householder vectors.
         ! This can be done in different ways, we use dsyrk

         vav = 0
857
         call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
858
         if (useGPU_reduction_lower_block_to_tridiagonal) then
859
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
860 861 862 863 864 865
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',            &
#endif
#if COMPLEXCASE == 1
             call PRECISION_HERK('U', 'C',            &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
866
                           n_cols, l_rows, ONE, &
867 868 869
                           vmrCUDA, cur_l_rows, &
                           ZERO, vav, ubound(vav,dim=1))

Andreas Marek's avatar
Andreas Marek committed
870
         else ! useGPU_reduction_to_tridiagonal
871
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
872 873
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',           &
874 875
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
876
             call PRECISION_HERK('U', 'C',           &
877
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
878
                           n_cols, l_rows, ONE, vmrCPU, ubound(vmrCPU,dim=1), ZERO, vav, ubound(vav,dim=1))
Andreas Marek's avatar
Andreas Marek committed
879
         endif
880
         call obj%timer%stop("blas")
881
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
882
         call symm_matrix_allreduce_&
883 884
#endif
#if COMPLEXCASE == 1
885
         call herm_matrix_allreduce_&
886
#endif
Andreas Marek's avatar
Andreas Marek committed
887
         &PRECISION &
888
                         (obj, n_cols,vav, nbw, nbw,mpi_comm_rows)
889
         ! Calculate triangular matrix T for block Householder Transformation
890
         call obj%timer%start("blas")
891 892 893
         do lc=n_cols,1,-1
           tau = tmat(lc,lc,istep)
           if (lc<n_cols) then
Pavel Kus's avatar
Pavel Kus committed
894
             call PRECISION_TRMV('U', BLAS_TRANS_OR_CONJ, 'N',&
Andreas Marek's avatar