elpa2_bandred_template.F90 59.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
63 64 65 66
    subroutine bandred_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
67
    (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
Andreas Marek's avatar
Andreas Marek committed
68
     tmat_dev, wantDebug, useGPU, success &
69
#if REALCASE == 1
70
     , useQR)
71 72
#endif
#if COMPLEXCASE == 1
73
     )
74
#endif
75

76
  !-------------------------------------------------------------------------------
77
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
112
      use elpa_abstract_impl
113
      implicit none
114
#include "../general/precision_kinds.F90"
115
      class(elpa_abstract_impl_t), intent(inout) :: obj
116 117 118
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#ifdef USE_ASSUMED_SIZE
119
      MATH_DATATYPE(kind=rck)                    :: a(lda,*), tmat(nbw,nbw,*)
120
#else
121
      MATH_DATATYPE(kind=rck)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
122
#endif
Andreas Marek's avatar
Andreas Marek committed
123 124

#if REALCASE == 1
125
      real(kind=rk)                               :: eps
126 127
#endif
      logical, intent(in)                         :: useGPU
128

129 130 131
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
132
      integer(kind=ik)                            :: vmrCols
133
#endif
Andreas Marek's avatar
Andreas Marek committed
134 135 136 137
#ifdef WITH_OPENMP
      integer(kind=ik)                            :: mynlc, lrs, transformChunkSize
#endif
      integer(kind=ik)                            :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
138 139
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
140

141 142
      real(kind=rk)                    :: vnorm2
      MATH_DATATYPE(kind=rck)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
143

144
!      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
145 146 147
      MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      MATH_DATATYPE(kind=rck), allocatable :: vr(:)
148 149 150 151

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
152 153
      real(kind=rk)                    :: dwork_size(1)
      real(kind=rk), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
154
#endif
155
      ! a_dev is passed from bandred_real to trans_ev_band
156
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
157
#ifdef WITH_MPI
158 159 160 161
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
162
      integer(kind=c_intptr_t)                      :: lc_start, lc_end
163
#if COMPLEXCASE == 1
164
      integer(kind=c_intptr_t)                      :: lce_1, lcs_1, lre_1
165 166 167 168 169
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
170 171
#endif

172 173 174 175 176
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
177

178 179 180 181
#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
Andreas Marek's avatar
Andreas Marek committed
182
                                                    ii, pp
183
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
184 185 186
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
187

188
      call obj%timer%start("bandred_&
Andreas Marek's avatar
Andreas Marek committed
189 190 191 192
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )
193
      if (wantDebug) call obj%timer%start("mpi_communication")
194 195 196 197 198

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
Andreas Marek's avatar
Andreas Marek committed
199

200
      if (wantDebug) call obj%timer%stop("mpi_communication")
201 202 203 204 205 206 207
      success = .true.


      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
Andreas Marek's avatar
Andreas Marek committed
208
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
209 210
      &MATH_DATATYPE&
      &: ERROR: nbw=',nbw,', nblk=',nblk
Andreas Marek's avatar
Andreas Marek committed
211
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
212 213
      &MATH_DATATYPE&
      &: ELPA2 works only for nbw==n*nblk'
214 215 216 217 218 219 220 221 222
          endif
          success = .false.
          return
        endif
      endif

! na_rows in used nowhere; only na_cols
      if (useGPU) then
#ifdef WITH_MPI
223
#if COMPLEXCASE == 1
224
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
225
#endif
226 227
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
228 229 230
#if COMPLEXCASE == 1
         na_rows = na
#endif
231
        na_cols = na
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
232
#endif /* WITH_MPI */
233 234

        ! Here we convert the regular host array into a pinned host array
235
        successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
236
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
237
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
238 239
                  &MATH_DATATYPE&
                  &: error in cudaMalloc a_dev 1"
Andreas Marek's avatar
Andreas Marek committed
240
          stop 1
241 242
        endif

243
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
244
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
245
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
246 247
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
Andreas Marek's avatar
Andreas Marek committed
248
          stop 1
249 250
        endif

251
        successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
252
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
253
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
254 255
                  &MATH_DATATYPE&
                  &: error in cudaMalloc vav_dev 1"
Andreas Marek's avatar
Andreas Marek committed
256
          stop 1
257
        endif
258 259 260 261 262 263 264 265 266 267
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

268
#if REALCASE == 1
269 270 271 272
      if (useQR) then

        if (useGPU) then
          print *,"qr decomposition at the moment not supported with GPU"
Andreas Marek's avatar
Andreas Marek committed
273
          stop 1
274 275 276
        endif

        if (which_qr_decomposition == 1) then
277
          call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
278 279 280
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
281
            stop 1
282 283 284 285 286
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
287
            stop 1
288 289 290 291 292 293
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
294
            stop 1
295 296 297 298 299
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
300
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
301 302
               &PRECISION&
               &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
303 304 305 306
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
307
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
308 309
               &PRECISION&
               &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
310 311 312 313 314
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

315
          work_size = int(dwork_size(1))
316 317 318
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
319
            stop 1
320
          endif
Pavel Kus's avatar
Pavel Kus committed
321
          work_blocked = 0.0_rk
322 323 324
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
325
            stop 1
326 327 328 329 330
          endif

        endif ! which_qr_decomposition

      endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
331
#endif /* REALCASE */
332 333 334 335 336

      if (useGPU) then

        cur_l_rows = 0
        cur_l_cols = 0
Andreas Marek's avatar
Andreas Marek committed
337

338
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
339
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
340
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
341 342
                  &MATH_DATATYPE&
                  &: error in cudaMemcpy a_dev 2"
Andreas Marek's avatar
Andreas Marek committed
343
          stop 1
344
        endif
345 346 347 348 349 350 351 352 353 354 355
      endif ! useGPU


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

356 357
        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

358 359 360 361 362 363 364 365 366 367 368 369 370
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
371
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
372 373
                        &MATH_DATATYPE&
                        &: error when deallocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
374
                stop 1
375 376 377 378
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
379
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
380 381
                      &MATH_DATATYPE&
                      &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
382
              stop 1
383 384 385 386 387 388 389 390
            endif

          endif

          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
391
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
392 393
                        &MATH_DATATYPE&
                        &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
394
                stop 1
395 396 397 398
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
399
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
400
                        &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
Andreas Marek's avatar
Andreas Marek committed
401
                stop 1
402 403 404 405
              endif
            endif

            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
406

407
            if (istat .ne. 0) then
408
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
409 410
                      &MATH_DATATYPE&
                      &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
411
              stop 1
412
            endif
413
            successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
414
            if (.not.(successCUDA)) then
415
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
416 417
                      &MATH_DATATYPE&
                      &: error in cudaMalloc: vmr_dev2"
Andreas Marek's avatar
Andreas Marek committed
418
              stop 1
419 420 421
            endif

          endif
422 423 424 425 426

          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
427
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
428 429
                        &MATH_DATATYPE&
                        &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
430
                stop 1
431 432 433 434
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
435
                 print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
436 437
                         &MATH_DATATYPE&
                         &: error in cudaFree umc_dev 1"
Andreas Marek's avatar
Andreas Marek committed
438
                 stop 1
439 440 441
              endif

            endif
442

443
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
444

445
            if (istat .ne. 0) then
446
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
447 448
                      &MATH_DATATYPE&
                      &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
449
              stop 1
450 451
            endif

452
            successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
453
            if (.not.(successCUDA)) then
454
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
455 456
                      &MATH_DATATYPE&
                      &: error in cudaMalloc umc_dev 2"
Andreas Marek's avatar
Andreas Marek committed
457
              stop 1
458
            endif
459

460
          endif
461 462 463

        else ! GPU not used

464 465 466
          ! unify the the name vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
467 468 469

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
470
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
471 472
                     &MATH_DATATYPE&
                     &: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
473
            stop 1
474 475
          endif

476 477
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
478
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
479 480
                    &MATH_DATATYPE&
                    &: error when allocating umcCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
481
            stop 1
482
          endif
Andreas Marek's avatar
Andreas Marek committed
483

484 485
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
486
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
487 488
                    &MATH_DATATYPE&
                    &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
489
            stop 1
490 491 492 493 494
          endif

        endif ! use GPU

        if (useGPU) then
Pavel Kus's avatar
Pavel Kus committed
495
          vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck
496
        else
Pavel Kus's avatar
Pavel Kus committed
497
          vmrCPU(1:l_rows,1:n_cols) = 0.0_rck
498 499
        endif ! useGPU

Pavel Kus's avatar
Pavel Kus committed
500 501
        vr(:) = 0.0_rck
        tmat(:,:,istep) = 0.0_rck
502
        if (useGPU) then
503
#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
504
          umcCUDA(1 : umc_size) = 0.0_rck
505
#endif
506 507 508 509
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

510
          if (lc_start .le. 0) lc_start = 1
511 512 513 514

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

515 516
          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
Andreas Marek's avatar
Retab  
Andreas Marek committed
517
                                      int((lda*size_of_datatype),kind=c_intptr_t), &
518 519 520 521
                                            (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
                                            int(lda*size_of_datatype,kind=c_intptr_t),              &
                                            int(lr_end*size_of_datatype,kind=c_intptr_t),           &
                                             int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
522

523

524
            if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
525
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
526 527
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy2d"
Andreas Marek's avatar
Andreas Marek committed
528
              stop 1
529 530 531 532 533
            endif
          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
534
#if REALCASE == 1
535 536 537 538
        if (useQR) then
          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
539
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
540 541
                 &PRECISION&
                 &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
542 543 544 545 546 547 548
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
549
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
550 551
                 &PRECISION&
                 &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
552 553 554 555 556 557 558 559 560 561
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
562
#endif /* REALCASE == 1 */
563 564
         do lc = n_cols, 1, -1

565
           ncol = istep*nbw + lc ! absolute column number of householder Vector
566 567 568 569 570 571 572 573 574 575 576 577 578
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

579
             ! Get Vector to be transformed; distribute last element and norm of
580 581
             ! remaining elements to all procs in current column

582
             vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
583 584 585 586 587 588

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
Pavel Kus's avatar
Pavel Kus committed
589
               aux1(2) = 0.0_rck
590 591 592
             endif

#ifdef WITH_MPI
593
             if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
594
             call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
Andreas Marek's avatar
Andreas Marek committed
595
                                MPI_SUM, mpi_comm_rows, mpierr)
596
             if (wantDebug) call obj%timer%stop("mpi_communication")
597 598 599

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
600
#endif
601

602
#if REALCASE == 1
603
             vnorm2 = aux2(1)
604 605 606 607
#endif
#if COMPLEXCASE == 1
             vnorm2 = real(aux2(1),kind=rk)
#endif
608 609 610
             vrl    = aux2(2)

             ! Householder transformation
Pavel Kus's avatar
Pavel Kus committed
611 612 613
       call hh_transform_&
             &MATH_DATATYPE&
             &_&
Andreas Marek's avatar
Andreas Marek committed
614
             &PRECISION &
Pavel Kus's avatar
Pavel Kus committed
615
                         (obj, vrl, vnorm2, xf, tau, wantDebug)
616
             ! Scale vr and store Householder Vector for back transformation
617 618 619 620 621

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
Pavel Kus's avatar
Pavel Kus committed
622
               vr(lr) = 1.0_rck
623 624 625 626 627 628
             else
               a(1:lr,lch) = vr(1:lr)
             endif

           endif

629
           ! Broadcast Householder Vector and tau along columns
630 631 632

           vr(lr+1) = tau
#ifdef WITH_MPI
633
           if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
634
           call MPI_Bcast(vr, lr+1, MPI_MATH_DATATYPE_PRECISION, &
Andreas Marek's avatar
Andreas Marek committed
635
                          cur_pcol, mpi_comm_cols, mpierr)
636
           if (wantDebug) call obj%timer%stop("mpi_communication")
637 638

#endif /* WITH_MPI */
639

640 641
           if (useGPU) then
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
642 643 644
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
645 646
           tau = vr(lr+1)

647 648 649 650 651 652
#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
653
           ! Transform remaining columns in current block with Householder Vector
654 655
           ! Local dot product

Pavel Kus's avatar
Pavel Kus committed
656
           aux1 = 0.0_rck
657 658

#ifdef WITH_OPENMP
659 660 661
#if 0
 ! original complex implementation without openmp. check performance
            nlc = 0 ! number of local columns
662 663 664 665 666 667 668 669 670 671
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
672
           if (wantDebug) call obj%timer%start("mpi_communication")
673 674 675 676 677 678 679 680 681 682
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
683

684 685 686
             endif
           enddo

687

688
           if (wantDebug) call obj%timer%stop("mpi_communication")
689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
!               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
714

715 716
!             endif
!           enddo
717
#endif /* if 0 */
718

719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
746
           if (wantDebug) call obj%timer%start("mpi_communication")
747
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_MATH_DATATYPE_PRECISION, &
Pavel Kus's avatar
Pavel Kus committed
748
                                           MPI_SUM, mpi_comm_rows, mpierr)
749
           if (wantDebug) call obj%timer%stop("mpi_communication")
750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
#if REALCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
779 780 781 782 783 784 785 786 787 788 789 790 791 792

#else /* WITH_OPENMP */

           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
793
           if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
794 795
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_MATH_DATATYPE_PRECISION, &
                                         MPI_SUM, mpi_comm_rows, mpierr)
796
           if (wantDebug) call obj%timer%stop("mpi_communication")
797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
#if REALCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
#endif
#if COMPLEXCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
#endif
             endif
           enddo
#endif /* WITH_OPENMP */
816 817 818 819 820 821
         enddo ! lc

         if (useGPU) then
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
822
             successCUDA = cuda_memcpy2d((a_dev+        &
823 824 825 826 827
                                         int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
                                         int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
                                         int(lda*size_of_datatype,kind=c_intptr_t),           &
                                         int(lr_end*size_of_datatype,kind=c_intptr_t),        &
                                         int((lc_end - lc_start+1),kind=c_intptr_t), &
828
                                         int(cudaMemcpyHostToDevice,kind=c_int))
829

830
             if (.not.(successCUDA)) then
831
               print *, "bandred_&
832 833
                        &MATH_DATATYPE&
                        &: cuda memcpy a_dev  failed ", istat
Andreas Marek's avatar
Andreas Marek committed
834
               stop 1
835
             endif
836 837 838 839 840 841 842
           endif
         endif

         ! Calculate scalar products of stored Householder vectors.
         ! This can be done in different ways, we use dsyrk

         vav = 0
843
         call obj%timer%start("blas")
844 845
         if (useGPU) then
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
846 847 848 849 850 851
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',            &
#endif
#if COMPLEXCASE == 1
             call PRECISION_HERK('U', 'C',            &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
852
                           n_cols, l_rows, ONE, &
853 854 855
                           vmrCUDA, cur_l_rows, &
                           ZERO, vav, ubound(vav,dim=1))

856
         else ! useGPU
857
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
858 859
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',           &
860 861
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
862
             call PRECISION_HERK('U', 'C',           &
863
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
864
                           n_cols, l_rows, ONE, vmrCPU, ubound(vmrCPU,dim=1), ZERO, vav, ubound(vav,dim=1))
Andreas Marek's avatar
Andreas Marek committed
865
         endif
866
         call obj%timer%stop("blas")
867
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
868
         call symm_matrix_allreduce_&
869 870
#endif
#if COMPLEXCASE == 1
871
         call herm_matrix_allreduce_&
872
#endif
Andreas Marek's avatar
Andreas Marek committed
873
         &PRECISION &
874
                         (obj, n_cols,vav, nbw, nbw,mpi_comm_rows)
875
         ! Calculate triangular matrix T for block Householder Transformation
876
         call obj%timer%start("blas")
877 878 879
         do lc=n_cols,1,-1
           tau = tmat(lc,lc,istep)
           if (lc<n_cols) then
Pavel Kus's avatar
Pavel Kus committed
880
             call PRECISION_TRMV('U', BLAS_TRANS_OR_CONJ, 'N',&
Andreas Marek's avatar
Andreas Marek committed
881
                                 n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
882 883

#if REALCASE == 1
884
             tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
885 886 887 888
#endif
#if COMPLEXCASE == 1
             tmat(lc,lc+1:n_cols,istep) = -tau * conjg(vav(lc+1:n_cols,lc))
#endif
889 890
           endif
         enddo
891
         call obj%timer%stop("blas")
892 893 894
#if REALCASE == 1
       endif !useQR
#endif