elpa2_bandred_template.F90 62 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
63 64 65 66
    subroutine bandred_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
67
    (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
Andreas Marek's avatar
Andreas Marek committed
68
     tmat_dev, wantDebug, useGPU, success &
69
#if REALCASE == 1
70
     , useQR)
71 72
#endif
#if COMPLEXCASE == 1
73
     )
74
#endif
75

76
  !-------------------------------------------------------------------------------
77
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
112
      use elpa_abstract_impl
113
      implicit none
114
#include "../general/precision_kinds.F90"
115
      class(elpa_abstract_impl_t), intent(inout) :: obj
116 117 118
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#ifdef USE_ASSUMED_SIZE
119
      MATH_DATATYPE(kind=rck)                    :: a(lda,*), tmat(nbw,nbw,*)
120
#else
121
      MATH_DATATYPE(kind=rck)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
122
#endif
Andreas Marek's avatar
Andreas Marek committed
123 124

#if REALCASE == 1
125
      real(kind=rk)                               :: eps
126 127
#endif
      logical, intent(in)                         :: useGPU
128

129 130 131
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
132
      integer(kind=ik)                            :: vmrCols
133
#endif
Andreas Marek's avatar
Andreas Marek committed
134 135 136 137
#ifdef WITH_OPENMP
      integer(kind=ik)                            :: mynlc, lrs, transformChunkSize
#endif
      integer(kind=ik)                            :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
138 139
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
140

141 142
      real(kind=rk)                    :: vnorm2
      MATH_DATATYPE(kind=rck)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
143

144
!      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
145 146 147
      MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      MATH_DATATYPE(kind=rck), allocatable :: vr(:)
148 149 150 151

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
152 153
      real(kind=rk)                    :: dwork_size(1)
      real(kind=rk), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
154
#endif
155
      ! a_dev is passed from bandred_real to trans_ev_band
156
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
157
#ifdef WITH_MPI
158 159 160 161
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
Andreas Marek's avatar
Andreas Marek committed
162
      integer(kind=c_intptr_t)                    :: lc_start, lc_end
163
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
164
      integer(kind=c_intptr_t)                    :: lce_1, lcs_1, lre_1
165 166 167 168 169
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
170 171
#endif

172 173 174 175 176
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
177
      integer(kind=ik)                            :: min_tile_size, error
178

179 180 181 182
#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
183
                                                    ii, pp
184
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
185 186 187
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
188

Andreas Marek's avatar
Andreas Marek committed
189 190 191
      logical                                     :: useGPU_reduction_lower_block_to_tridiagonal


192
      call obj%timer%start("bandred_&
Andreas Marek's avatar
Andreas Marek committed
193 194 195 196
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )
Andreas Marek's avatar
Andreas Marek committed
197
      useGPU_reduction_lower_block_to_tridiagonal = .false.
Andreas Marek's avatar
Andreas Marek committed
198 199 200 201 202 203 204 205 206 207 208 209

      if (useGPU) then
        useGPU_reduction_lower_block_to_tridiagonal = .true.
#if REALCASE == 1
        if (useQR) then
          !in this case switch off GPU usage for step "reduce current block to lower triangular form"
          ! since this is done by QR decomposition
          useGPU_reduction_lower_block_to_tridiagonal = .false.
        endif
#endif
      endif

210
      if (wantDebug) call obj%timer%start("mpi_communication")
211 212 213 214 215

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
Andreas Marek's avatar
Andreas Marek committed
216

217
      if (wantDebug) call obj%timer%stop("mpi_communication")
218 219 220 221 222 223 224
      success = .true.


      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
Andreas Marek's avatar
Andreas Marek committed
225
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Andreas Marek committed
226 227
                                 &MATH_DATATYPE&
                                 &: ERROR: nbw=',nbw,', nblk=',nblk
Andreas Marek's avatar
Andreas Marek committed
228
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Andreas Marek committed
229 230
                                 &MATH_DATATYPE&
                                 &: ELPA2 works only for nbw==n*nblk'
231 232 233 234 235 236
          endif
          success = .false.
          return
        endif
      endif

Andreas Marek's avatar
Andreas Marek committed
237
      ! na_rows in used nowhere; only na_cols
238 239
      if (useGPU) then
#ifdef WITH_MPI
240
#if COMPLEXCASE == 1
241
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
242
#endif
243 244
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
245 246 247
#if COMPLEXCASE == 1
         na_rows = na
#endif
248
        na_cols = na
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
249
#endif /* WITH_MPI */
250 251

        ! Here we convert the regular host array into a pinned host array
252
        successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
253
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
254
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
255 256
                  &MATH_DATATYPE&
                  &: error in cudaMalloc a_dev 1"
Andreas Marek's avatar
Andreas Marek committed
257
          stop 1
258 259
        endif

260
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
261
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
262
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
263 264
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
Andreas Marek's avatar
Andreas Marek committed
265
          stop 1
266 267
        endif

268
        successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
269
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
270
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
271 272
                  &MATH_DATATYPE&
                  &: error in cudaMalloc vav_dev 1"
Andreas Marek's avatar
Andreas Marek committed
273
          stop 1
274
        endif
275 276 277 278 279
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
280 281 282 283 284 285 286 287 288 289 290 291 292 293 294

      ! make tile_size a smallest possible multiple of previously defined tile size, such that it is
      ! larger or equal to min_tile_size
      ! min_tile_size has been originally hardcoded as 128 * max(np_rows, np_cols), so it is now the implicit value
      ! it can, however, be set by the user
      call obj%get("min_tile_size", min_tile_size ,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem setting option. Aborting..."
        stop
      endif
      if(min_tile_size == 0) then
        ! not set by the user, use the default value
        min_tile_size = 128*max(np_rows, np_cols)
      endif
      tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
295 296 297 298

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

299
#if REALCASE == 1
300 301 302
      if (useQR) then

        if (which_qr_decomposition == 1) then
303
          call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
304 305 306
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
307
            stop 1
308 309 310 311 312
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
313
            stop 1
314 315 316 317 318 319
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
320
            stop 1
321 322 323 324 325
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
326
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
327 328
               &PRECISION&
               &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
329 330 331 332
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
333
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
334 335
               &PRECISION&
               &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
336 337 338 339 340
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

341
          work_size = int(dwork_size(1))
342 343 344
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
345
            stop 1
346
          endif
Pavel Kus's avatar
Pavel Kus committed
347
          work_blocked = 0.0_rk
348 349 350
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
351
            stop 1
352 353 354 355 356
          endif

        endif ! which_qr_decomposition

      endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
357
#endif /* REALCASE */
358 359 360 361 362

      if (useGPU) then

        cur_l_rows = 0
        cur_l_cols = 0
Andreas Marek's avatar
Andreas Marek committed
363

364
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
365
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
366
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
367 368
                  &MATH_DATATYPE&
                  &: error in cudaMemcpy a_dev 2"
Andreas Marek's avatar
Andreas Marek committed
369
          stop 1
370
        endif
371 372 373 374 375 376 377 378 379 380 381
      endif ! useGPU


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

382 383
        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

384 385 386 387 388 389 390 391 392 393 394 395 396
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
397
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
398 399
                        &MATH_DATATYPE&
                        &: error when deallocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
400
                stop 1
401 402 403 404
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
405
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
406 407
                      &MATH_DATATYPE&
                      &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
408
              stop 1
409 410 411 412 413 414 415 416
            endif

          endif

          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
417
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
418 419
                        &MATH_DATATYPE&
                        &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
420
                stop 1
421 422 423 424
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
425
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
426
                        &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
Andreas Marek's avatar
Andreas Marek committed
427
                stop 1
428 429 430 431
              endif
            endif

            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
432

433
            if (istat .ne. 0) then
434
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
435 436
                      &MATH_DATATYPE&
                      &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
437
              stop 1
438
            endif
439
            successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
440
            if (.not.(successCUDA)) then
441
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
442 443
                      &MATH_DATATYPE&
                      &: error in cudaMalloc: vmr_dev2"
Andreas Marek's avatar
Andreas Marek committed
444
              stop 1
445 446 447
            endif

          endif
448 449 450 451 452

          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
453
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
454 455
                        &MATH_DATATYPE&
                        &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
456
                stop 1
457 458 459 460
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
461
                 print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
462 463
                         &MATH_DATATYPE&
                         &: error in cudaFree umc_dev 1"
Andreas Marek's avatar
Andreas Marek committed
464
                 stop 1
465 466 467
              endif

            endif
468

469
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
470

471
            if (istat .ne. 0) then
472
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
473 474
                      &MATH_DATATYPE&
                      &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
475
              stop 1
476 477
            endif

478
            successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
479
            if (.not.(successCUDA)) then
480
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
481 482
                      &MATH_DATATYPE&
                      &: error in cudaMalloc umc_dev 2"
Andreas Marek's avatar
Andreas Marek committed
483
              stop 1
484
            endif
485

486
          endif
487 488 489

        else ! GPU not used

490 491 492
          ! unify the the name vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
493 494 495

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
496
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
497 498
                     &MATH_DATATYPE&
                     &: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
499
            stop 1
500 501
          endif

502 503
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
504
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
505 506
                    &MATH_DATATYPE&
                    &: error when allocating umcCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
507
            stop 1
508
          endif
Andreas Marek's avatar
Andreas Marek committed
509

510 511
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
512
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
513 514
                    &MATH_DATATYPE&
                    &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
515
            stop 1
516 517 518 519 520
          endif

        endif ! use GPU

        if (useGPU) then
Pavel Kus's avatar
Pavel Kus committed
521
          vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck
522
        else
Pavel Kus's avatar
Pavel Kus committed
523
          vmrCPU(1:l_rows,1:n_cols) = 0.0_rck
524 525
        endif ! useGPU

Pavel Kus's avatar
Pavel Kus committed
526 527
        vr(:) = 0.0_rck
        tmat(:,:,istep) = 0.0_rck
528
        if (useGPU) then
529
#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
530
          umcCUDA(1 : umc_size) = 0.0_rck
531
#endif
532 533 534 535
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

536
          if (lc_start .le. 0) lc_start = 1
537 538 539 540

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

541 542
          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
Andreas Marek's avatar
Retab  
Andreas Marek committed
543
                                      int((lda*size_of_datatype),kind=c_intptr_t), &
544 545 546 547
                                            (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
                                            int(lda*size_of_datatype,kind=c_intptr_t),              &
                                            int(lr_end*size_of_datatype,kind=c_intptr_t),           &
                                             int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
548

549

550
            if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
551
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
552 553
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy2d"
Andreas Marek's avatar
Andreas Marek committed
554
              stop 1
555 556 557 558 559
            endif
          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
560
#if REALCASE == 1
561
        if (useQR) then
Andreas Marek's avatar
Andreas Marek committed
562 563 564 565
          if (useGPU) then
 !            vmrCPU(1:cur_l_rows,1:n_cols) = vmrCUDA(1 : cur_l_rows * n_cols)
          endif

566 567 568
          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
569
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
570 571
                 &PRECISION&
                 &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
572 573 574 575 576 577 578
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
579
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
580 581
                 &PRECISION&
                 &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
582 583 584 585 586 587 588 589 590 591
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
592
#endif /* REALCASE == 1 */
593 594
         do lc = n_cols, 1, -1

595
           ncol = istep*nbw + lc ! absolute column number of householder Vector
596 597 598 599 600 601 602 603 604 605 606 607 608
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

609
             ! Get Vector to be transformed; distribute last element and norm of
610 611
             ! remaining elements to all procs in current column

612
             vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
613 614 615 616 617 618

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
Pavel Kus's avatar
Pavel Kus committed
619
               aux1(2) = 0.0_rck
620 621 622
             endif

#ifdef WITH_MPI
623
             if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
624
             call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
Andreas Marek's avatar
Andreas Marek committed
625
                                MPI_SUM, mpi_comm_rows, mpierr)
626
             if (wantDebug) call obj%timer%stop("mpi_communication")
627 628 629

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
630
#endif
631

632
#if REALCASE == 1
633
             vnorm2 = aux2(1)
634 635 636 637
#endif
#if COMPLEXCASE == 1
             vnorm2 = real(aux2(1),kind=rk)
#endif
638 639 640
             vrl    = aux2(2)

             ! Householder transformation
Pavel Kus's avatar
Pavel Kus committed
641 642 643
       call hh_transform_&
             &MATH_DATATYPE&
             &_&
Andreas Marek's avatar
Andreas Marek committed
644
             &PRECISION &
Pavel Kus's avatar
Pavel Kus committed
645
                         (obj, vrl, vnorm2, xf, tau, wantDebug)
646
             ! Scale vr and store Householder Vector for back transformation
647 648 649 650 651

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
Pavel Kus's avatar
Pavel Kus committed
652
               vr(lr) = 1.0_rck
653 654 655 656 657 658
             else
               a(1:lr,lch) = vr(1:lr)
             endif

           endif

659
           ! Broadcast Householder Vector and tau along columns
660 661 662

           vr(lr+1) = tau
#ifdef WITH_MPI
663
           if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
664
           call MPI_Bcast(vr, lr+1, MPI_MATH_DATATYPE_PRECISION, &
Andreas Marek's avatar
Andreas Marek committed
665
                          cur_pcol, mpi_comm_cols, mpierr)
666
           if (wantDebug) call obj%timer%stop("mpi_communication")
667 668

#endif /* WITH_MPI */
669

Andreas Marek's avatar
Andreas Marek committed
670
           if (useGPU_reduction_lower_block_to_tridiagonal) then
671
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
672 673 674
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
675 676
           tau = vr(lr+1)

677 678 679 680 681 682
#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
683
           ! Transform remaining columns in current block with Householder Vector
684 685
           ! Local dot product

Pavel Kus's avatar
Pavel Kus committed
686
           aux1 = 0.0_rck
687 688

#ifdef WITH_OPENMP
689 690 691
#if 0
 ! original complex implementation without openmp. check performance
            nlc = 0 ! number of local columns
692 693 694 695 696 697 698 699 700 701
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
702
           if (wantDebug) call obj%timer%start("mpi_communication")
703 704 705 706 707 708 709 710 711 712
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
713

714 715 716
             endif
           enddo

717

718
           if (wantDebug) call obj%timer%stop("mpi_communication")
719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
!               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
744

745 746
!             endif
!           enddo
747
#endif /* if 0 */
748

749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
776
           if (wantDebug) call obj%timer%start("mpi_communication")
777
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_MATH_DATATYPE_PRECISION, &
Pavel Kus's avatar
Pavel Kus committed
778
                                           MPI_SUM, mpi_comm_rows, mpierr)
779
           if (wantDebug) call obj%timer%stop("mpi_communication")
780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
#if REALCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
809 810 811 812 813 814 815 816 817 818 819 820 821 822

#else /* WITH_OPENMP */

           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
823
           if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
824 825
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_MATH_DATATYPE_PRECISION, &
                                         MPI_SUM, mpi_comm_rows, mpierr)
826
           if (wantDebug) call obj%timer%stop("mpi_communication")
827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
#if REALCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
#endif
#if COMPLEXCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
#endif
             endif
           enddo
#endif /* WITH_OPENMP */
846 847
         enddo ! lc

Andreas Marek's avatar
Andreas Marek committed
848
         if (useGPU_reduction_lower_block_to_tridiagonal) then
849 850 851
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
852
             successCUDA = cuda_memcpy2d((a_dev+        &
853 854 855 856 857
                                         int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
                                         int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
                                         int(lda*size_of_datatype,kind=c_intptr_t),           &
                                         int(lr_end*size_of_datatype,kind=c_intptr_t),        &
                                         int((lc_end - lc_start+1),kind=c_intptr_t), &
858
                                         int(cudaMemcpyHostToDevice,kind=c_int))
859

860
             if (.not.(successCUDA)) then
861
               print *, "bandred_&
862 863
                        &MATH_DATATYPE&
                        &: cuda memcpy a_dev  failed ", istat
Andreas Marek's avatar
Andreas Marek committed
864
               stop 1
865
             endif
866 867 868 869 870 871 872
           endif
         endif

         ! Calculate scalar products of stored Householder vectors.
         ! This can be done in different ways, we use dsyrk

         vav = 0
873
         call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
874
         if (useGPU_reduction_lower_block_to_tridiagonal) then
875
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
876 877 878 879 880 881
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',            &
#endif
#if COMPLEXCASE == 1
             call PRECISION_HERK('U', 'C',            &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
882
                           n_cols, l_rows, ONE, &
883 884 885
                           vmrCUDA, cur_l_rows, &
                           ZERO, vav, ubound(vav,dim=1))

Andreas Marek's avatar
Andreas Marek committed
886
         else ! useGPU_reduction_to_tridiagonal
887
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
888 889
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',           &
890 891
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
892
             call PRECISION_HERK('U', 'C',           &
893
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
894
                           n_cols, l_rows, ONE, vmrCPU, ubound(vmrCPU,dim=1), ZERO, vav, ubound(vav,dim=1))
Andreas Marek's avatar
Andreas Marek committed
895
         endif
896
         call obj%timer%stop("blas")
897
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
898
         call symm_matrix_allreduce_&
899 900
#endif
#if COMPLEXCASE == 1
901
         call herm_matrix_allreduce_&
902
#endif
Andreas Marek's avatar