elpa2_bandred_template.F90 61.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
63 64 65 66
    subroutine bandred_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
67
    (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
Andreas Marek's avatar
Andreas Marek committed
68
     tmat_dev, wantDebug, useGPU, success &
69
#if REALCASE == 1
70
     , useQR)
71 72
#endif
#if COMPLEXCASE == 1
73
     )
74
#endif
75

76
  !-------------------------------------------------------------------------------
77
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
112
      use elpa_abstract_impl
113
      implicit none
114
#include "../general/precision_kinds.F90"
115
      class(elpa_abstract_impl_t), intent(inout) :: obj
116 117 118
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#ifdef USE_ASSUMED_SIZE
119
      MATH_DATATYPE(kind=rck)                    :: a(lda,*), tmat(nbw,nbw,*)
120
#else
121
      MATH_DATATYPE(kind=rck)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
122
#endif
Andreas Marek's avatar
Andreas Marek committed
123 124

#if REALCASE == 1
125
      real(kind=rk)                               :: eps
126 127
#endif
      logical, intent(in)                         :: useGPU
128

129 130 131
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
132
      integer(kind=ik)                            :: vmrCols
133
#endif
Andreas Marek's avatar
Andreas Marek committed
134 135 136 137
#ifdef WITH_OPENMP
      integer(kind=ik)                            :: mynlc, lrs, transformChunkSize
#endif
      integer(kind=ik)                            :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
138 139
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
140

141 142
      real(kind=rk)                    :: vnorm2
      MATH_DATATYPE(kind=rck)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
143

144
!      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
145 146 147
      MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      MATH_DATATYPE(kind=rck), allocatable :: vr(:)
148 149 150 151

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
152 153
      real(kind=rk)                    :: dwork_size(1)
      real(kind=rk), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
154
#endif
155
      ! a_dev is passed from bandred_real to trans_ev_band
156
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
157
#ifdef WITH_MPI
158 159 160 161
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
Andreas Marek's avatar
Andreas Marek committed
162
      integer(kind=c_intptr_t)                    :: lc_start, lc_end
163
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
164
      integer(kind=c_intptr_t)                    :: lce_1, lcs_1, lre_1
165 166 167 168 169
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
170 171
#endif

172 173 174 175 176
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
177

178 179 180 181
#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
Andreas Marek's avatar
Andreas Marek committed
182
                                                    ii, pp, transformChunkSize
183
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
184 185 186
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
187

Andreas Marek's avatar
Andreas Marek committed
188 189 190
      logical                                     :: useGPU_reduction_lower_block_to_tridiagonal


191
      call obj%timer%start("bandred_&
Andreas Marek's avatar
Andreas Marek committed
192 193 194 195
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )
Andreas Marek's avatar
Andreas Marek committed
196
      useGPU_reduction_lower_block_to_tridiagonal = .false.
Andreas Marek's avatar
Andreas Marek committed
197 198 199 200 201 202 203 204 205 206 207 208

      if (useGPU) then
        useGPU_reduction_lower_block_to_tridiagonal = .true.
#if REALCASE == 1
        if (useQR) then
          !in this case switch off GPU usage for step "reduce current block to lower triangular form"
          ! since this is done by QR decomposition
          useGPU_reduction_lower_block_to_tridiagonal = .false.
        endif
#endif
      endif

209
      if (wantDebug) call obj%timer%start("mpi_communication")
210 211 212 213 214

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
Andreas Marek's avatar
Andreas Marek committed
215

216
      if (wantDebug) call obj%timer%stop("mpi_communication")
217 218 219 220 221 222 223
      success = .true.


      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
Andreas Marek's avatar
Andreas Marek committed
224
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Andreas Marek committed
225 226
                                 &MATH_DATATYPE&
                                 &: ERROR: nbw=',nbw,', nblk=',nblk
Andreas Marek's avatar
Andreas Marek committed
227
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Andreas Marek committed
228 229
                                 &MATH_DATATYPE&
                                 &: ELPA2 works only for nbw==n*nblk'
230 231 232 233 234 235
          endif
          success = .false.
          return
        endif
      endif

Andreas Marek's avatar
Andreas Marek committed
236
      ! na_rows in used nowhere; only na_cols
237 238
      if (useGPU) then
#ifdef WITH_MPI
239
#if COMPLEXCASE == 1
240
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
241
#endif
242 243
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
244 245 246
#if COMPLEXCASE == 1
         na_rows = na
#endif
247
        na_cols = na
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
248
#endif /* WITH_MPI */
249 250

        ! Here we convert the regular host array into a pinned host array
251
        successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
252
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
253
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
254 255
                  &MATH_DATATYPE&
                  &: error in cudaMalloc a_dev 1"
Andreas Marek's avatar
Andreas Marek committed
256
          stop 1
257 258
        endif

259
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
260
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
261
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
262 263
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
Andreas Marek's avatar
Andreas Marek committed
264
          stop 1
265 266
        endif

267
        successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
268
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
269
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
270 271
                  &MATH_DATATYPE&
                  &: error in cudaMalloc vav_dev 1"
Andreas Marek's avatar
Andreas Marek committed
272
          stop 1
273
        endif
274 275 276 277 278 279 280 281 282 283
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

284
#if REALCASE == 1
285 286 287
      if (useQR) then

        if (which_qr_decomposition == 1) then
288
          call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
289 290 291
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
292
            stop 1
293 294 295 296 297
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
298
            stop 1
299 300 301 302 303 304
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
305
            stop 1
306 307 308 309 310
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
311
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
312 313
               &PRECISION&
               &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
314 315 316 317
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
318
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
319 320
               &PRECISION&
               &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
321 322 323 324 325
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

326
          work_size = int(dwork_size(1))
327 328 329
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
330
            stop 1
331
          endif
Pavel Kus's avatar
Pavel Kus committed
332
          work_blocked = 0.0_rk
333 334 335
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
336
            stop 1
337 338 339 340 341
          endif

        endif ! which_qr_decomposition

      endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
342
#endif /* REALCASE */
343 344 345 346 347

      if (useGPU) then

        cur_l_rows = 0
        cur_l_cols = 0
Andreas Marek's avatar
Andreas Marek committed
348

349
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
350
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
351
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
352 353
                  &MATH_DATATYPE&
                  &: error in cudaMemcpy a_dev 2"
Andreas Marek's avatar
Andreas Marek committed
354
          stop 1
355
        endif
356 357 358 359 360 361 362 363 364 365 366
      endif ! useGPU


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

367 368
        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

369 370 371 372 373 374 375 376 377 378 379 380 381
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
382
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
383 384
                        &MATH_DATATYPE&
                        &: error when deallocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
385
                stop 1
386 387 388 389
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
390
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
391 392
                      &MATH_DATATYPE&
                      &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
393
              stop 1
394 395 396 397 398 399 400 401
            endif

          endif

          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
402
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
403 404
                        &MATH_DATATYPE&
                        &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
405
                stop 1
406 407 408 409
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
410
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
411
                        &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
Andreas Marek's avatar
Andreas Marek committed
412
                stop 1
413 414 415 416
              endif
            endif

            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
417

418
            if (istat .ne. 0) then
419
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
420 421
                      &MATH_DATATYPE&
                      &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
422
              stop 1
423
            endif
424
            successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
425
            if (.not.(successCUDA)) then
426
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
427 428
                      &MATH_DATATYPE&
                      &: error in cudaMalloc: vmr_dev2"
Andreas Marek's avatar
Andreas Marek committed
429
              stop 1
430 431 432
            endif

          endif
433 434 435 436 437

          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
438
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
439 440
                        &MATH_DATATYPE&
                        &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
441
                stop 1
442 443 444 445
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
446
                 print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
447 448
                         &MATH_DATATYPE&
                         &: error in cudaFree umc_dev 1"
Andreas Marek's avatar
Andreas Marek committed
449
                 stop 1
450 451 452
              endif

            endif
453

454
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
455

456
            if (istat .ne. 0) then
457
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
458 459
                      &MATH_DATATYPE&
                      &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
460
              stop 1
461 462
            endif

463
            successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
464
            if (.not.(successCUDA)) then
465
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
466 467
                      &MATH_DATATYPE&
                      &: error in cudaMalloc umc_dev 2"
Andreas Marek's avatar
Andreas Marek committed
468
              stop 1
469
            endif
470

471
          endif
472 473 474

        else ! GPU not used

475 476 477
          ! unify the the name vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
478 479 480

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
481
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
482 483
                     &MATH_DATATYPE&
                     &: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
484
            stop 1
485 486
          endif

487 488
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
489
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
490 491
                    &MATH_DATATYPE&
                    &: error when allocating umcCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
492
            stop 1
493
          endif
Andreas Marek's avatar
Andreas Marek committed
494

495 496
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
497
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
498 499
                    &MATH_DATATYPE&
                    &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
500
            stop 1
501 502 503 504 505
          endif

        endif ! use GPU

        if (useGPU) then
Pavel Kus's avatar
Pavel Kus committed
506
          vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck
507
        else
Pavel Kus's avatar
Pavel Kus committed
508
          vmrCPU(1:l_rows,1:n_cols) = 0.0_rck
509 510
        endif ! useGPU

Pavel Kus's avatar
Pavel Kus committed
511 512
        vr(:) = 0.0_rck
        tmat(:,:,istep) = 0.0_rck
513
        if (useGPU) then
514
#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
515
          umcCUDA(1 : umc_size) = 0.0_rck
516
#endif
517 518 519 520
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

521
          if (lc_start .le. 0) lc_start = 1
522 523 524 525

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

526 527
          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
Andreas Marek's avatar
Retab  
Andreas Marek committed
528
                                      int((lda*size_of_datatype),kind=c_intptr_t), &
529 530 531 532
                                            (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
                                            int(lda*size_of_datatype,kind=c_intptr_t),              &
                                            int(lr_end*size_of_datatype,kind=c_intptr_t),           &
                                             int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
533

534

535
            if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
536
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
537 538
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy2d"
Andreas Marek's avatar
Andreas Marek committed
539
              stop 1
540 541 542 543 544
            endif
          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
545
#if REALCASE == 1
546
        if (useQR) then
Andreas Marek's avatar
Andreas Marek committed
547 548 549 550
          if (useGPU) then
 !            vmrCPU(1:cur_l_rows,1:n_cols) = vmrCUDA(1 : cur_l_rows * n_cols)
          endif

551 552 553
          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
554
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
555 556
                 &PRECISION&
                 &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
557 558 559 560 561 562 563
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
564
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
565 566
                 &PRECISION&
                 &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
567 568 569 570 571 572 573 574 575 576
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
577
#endif /* REALCASE == 1 */
578 579
         do lc = n_cols, 1, -1

580
           ncol = istep*nbw + lc ! absolute column number of householder Vector
581 582 583 584 585 586 587 588 589 590 591 592 593
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

594
             ! Get Vector to be transformed; distribute last element and norm of
595 596
             ! remaining elements to all procs in current column

597
             vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
598 599 600 601 602 603

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
Pavel Kus's avatar
Pavel Kus committed
604
               aux1(2) = 0.0_rck
605 606 607
             endif

#ifdef WITH_MPI
608
             if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
609
             call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
Andreas Marek's avatar
Andreas Marek committed
610
                                MPI_SUM, mpi_comm_rows, mpierr)
611
             if (wantDebug) call obj%timer%stop("mpi_communication")
612 613 614

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
615
#endif
616

617
#if REALCASE == 1
618
             vnorm2 = aux2(1)
619 620 621 622
#endif
#if COMPLEXCASE == 1
             vnorm2 = real(aux2(1),kind=rk)
#endif
623 624 625
             vrl    = aux2(2)

             ! Householder transformation
Pavel Kus's avatar
Pavel Kus committed
626 627 628
       call hh_transform_&
             &MATH_DATATYPE&
             &_&
Andreas Marek's avatar
Andreas Marek committed
629
             &PRECISION &
Pavel Kus's avatar
Pavel Kus committed
630
                         (obj, vrl, vnorm2, xf, tau, wantDebug)
631
             ! Scale vr and store Householder Vector for back transformation
632 633 634 635 636

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
Pavel Kus's avatar
Pavel Kus committed
637
               vr(lr) = 1.0_rck
638 639 640 641 642 643
             else
               a(1:lr,lch) = vr(1:lr)
             endif

           endif

644
           ! Broadcast Householder Vector and tau along columns
645 646 647

           vr(lr+1) = tau
#ifdef WITH_MPI
648
           if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
649
           call MPI_Bcast(vr, lr+1, MPI_MATH_DATATYPE_PRECISION, &
Andreas Marek's avatar
Andreas Marek committed
650
                          cur_pcol, mpi_comm_cols, mpierr)
651
           if (wantDebug) call obj%timer%stop("mpi_communication")
652 653

#endif /* WITH_MPI */
654

Andreas Marek's avatar
Andreas Marek committed
655
           if (useGPU_reduction_lower_block_to_tridiagonal) then
656
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
657 658 659
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
660 661
           tau = vr(lr+1)

662 663 664 665 666 667
#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
668
           ! Transform remaining columns in current block with Householder Vector
669 670
           ! Local dot product

Pavel Kus's avatar
Pavel Kus committed
671
           aux1 = 0.0_rck
672 673

#ifdef WITH_OPENMP
674 675 676
#if 0
 ! original complex implementation without openmp. check performance
            nlc = 0 ! number of local columns
677 678 679 680 681 682 683 684 685 686
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
687
           if (wantDebug) call obj%timer%start("mpi_communication")
688 689 690 691 692 693 694 695 696 697
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
698

699 700 701
             endif
           enddo

702

703
           if (wantDebug) call obj%timer%stop("mpi_communication")
704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
!               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
729

730 731
!             endif
!           enddo
732
#endif /* if 0 */
733

734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
761
           if (wantDebug) call obj%timer%start("mpi_communication")
762
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_MATH_DATATYPE_PRECISION, &
Pavel Kus's avatar
Pavel Kus committed
763
                                           MPI_SUM, mpi_comm_rows, mpierr)
764
           if (wantDebug) call obj%timer%stop("mpi_communication")
765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
#if REALCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
794 795 796 797 798 799 800 801 802 803 804 805 806 807

#else /* WITH_OPENMP */

           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
808
           if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
809 810
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_MATH_DATATYPE_PRECISION, &
                                         MPI_SUM, mpi_comm_rows, mpierr)
811
           if (wantDebug) call obj%timer%stop("mpi_communication")
812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
#if REALCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
#endif
#if COMPLEXCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
#endif
             endif
           enddo
#endif /* WITH_OPENMP */
831 832
         enddo ! lc

Andreas Marek's avatar
Andreas Marek committed
833
         if (useGPU_reduction_lower_block_to_tridiagonal) then
834 835 836
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
837
             successCUDA = cuda_memcpy2d((a_dev+        &
838 839 840 841 842
                                         int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
                                         int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
                                         int(lda*size_of_datatype,kind=c_intptr_t),           &
                                         int(lr_end*size_of_datatype,kind=c_intptr_t),        &
                                         int((lc_end - lc_start+1),kind=c_intptr_t), &
843
                                         int(cudaMemcpyHostToDevice,kind=c_int))
844

845
             if (.not.(successCUDA)) then
846
               print *, "bandred_&
847 848
                        &MATH_DATATYPE&
                        &: cuda memcpy a_dev  failed ", istat
Andreas Marek's avatar
Andreas Marek committed
849
               stop 1
850
             endif
851 852 853 854 855 856 857
           endif
         endif

         ! Calculate scalar products of stored Householder vectors.
         ! This can be done in different ways, we use dsyrk

         vav = 0
858
         call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
859
         if (useGPU_reduction_lower_block_to_tridiagonal) then
860
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
861 862 863 864 865 866
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',            &
#endif
#if COMPLEXCASE == 1
             call PRECISION_HERK('U', 'C',            &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
867
                           n_cols, l_rows, ONE, &
868 869 870
                           vmrCUDA, cur_l_rows, &
                           ZERO, vav, ubound(vav,dim=1))

Andreas Marek's avatar
Andreas Marek committed
871
         else ! useGPU_reduction_to_tridiagonal
872
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
873 874
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',           &
875 876
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
877
             call PRECISION_HERK('U', 'C',           &
878
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
879
                           n_cols, l_rows, ONE, vmrCPU, ubound(vmrCPU,dim=1), ZERO, vav, ubound(vav,dim=1))
Andreas Marek's avatar
Andreas Marek committed
880
         endif
881
         call obj%timer%stop("blas")
882
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
883
         call symm_matrix_allreduce_&
884 885
#endif
#if COMPLEXCASE == 1
886
         call herm_matrix_allreduce_&
887
#endif
Andreas Marek's avatar
Andreas Marek committed
888
         &PRECISION &
889
                         (obj, n_cols,vav, nbw, nbw,mpi_comm_rows)
890
         ! Calculate triangular matrix T for block Householder Transformation
891
         call obj%timer%start("blas")
892 893 894
         do lc=n_cols,1,-1
           tau = tmat(lc,lc,istep)
           if (lc<n_cols) then