elpa2_bandred_template.F90 63.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
63 64 65 66
    subroutine bandred_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
67
    (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
Andreas Marek's avatar
Andreas Marek committed
68
     tmat_dev, wantDebug, useGPU, success &
69
#if REALCASE == 1
70
     , useQR)
71 72
#endif
#if COMPLEXCASE == 1
73
     )
74
#endif
75

76
  !-------------------------------------------------------------------------------
77
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
112
      use elpa_abstract_impl
113
      implicit none
114
#include "../general/precision_kinds.F90"
115
      class(elpa_abstract_impl_t), intent(inout) :: obj
116 117 118
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#ifdef USE_ASSUMED_SIZE
119
      MATH_DATATYPE(kind=rck)                    :: a(lda,*), tmat(nbw,nbw,*)
120
#else
121
      MATH_DATATYPE(kind=rck)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
122
#endif
Andreas Marek's avatar
Andreas Marek committed
123 124

#if REALCASE == 1
125
      real(kind=rk)                               :: eps
126 127
#endif
      logical, intent(in)                         :: useGPU
128

129 130 131
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
132
      integer(kind=ik)                            :: vmrCols
133
#endif
Andreas Marek's avatar
Andreas Marek committed
134 135 136 137
#ifdef WITH_OPENMP
      integer(kind=ik)                            :: mynlc, lrs, transformChunkSize
#endif
      integer(kind=ik)                            :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
138 139
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
140

141 142
      real(kind=rk)                    :: vnorm2
      MATH_DATATYPE(kind=rck)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
143

144
!      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
145 146 147
      MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      MATH_DATATYPE(kind=rck), allocatable :: vr(:)
148 149 150 151

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
152 153
      real(kind=rk)                    :: dwork_size(1)
      real(kind=rk), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
154
#endif
155
      ! a_dev is passed from bandred_real to trans_ev_band
156
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
157
#ifdef WITH_MPI
158 159 160 161
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
162
      integer(kind=c_intptr_t)                      :: lc_start, lc_end
163
#if COMPLEXCASE == 1
164
      integer(kind=c_intptr_t)                      :: lce_1, lcs_1, lre_1
165 166 167 168 169
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
170 171
#endif

172 173 174 175 176
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
177

178 179 180 181
#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
Andreas Marek's avatar
Andreas Marek committed
182
                                                    ii, pp
183
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
184 185 186
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
187

188
      call obj%timer%start("bandred_&
Andreas Marek's avatar
Andreas Marek committed
189 190 191 192
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )
193
      if (wantDebug) call obj%timer%start("mpi_communication")
194 195 196 197 198

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
Andreas Marek's avatar
Andreas Marek committed
199

200
      if (wantDebug) call obj%timer%stop("mpi_communication")
201 202 203 204 205 206 207
      success = .true.


      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
Andreas Marek's avatar
Andreas Marek committed
208
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
209 210
      &MATH_DATATYPE&
      &: ERROR: nbw=',nbw,', nblk=',nblk
Andreas Marek's avatar
Andreas Marek committed
211
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
212 213
      &MATH_DATATYPE&
      &: ELPA2 works only for nbw==n*nblk'
214 215 216 217 218 219 220 221 222
          endif
          success = .false.
          return
        endif
      endif

! na_rows in used nowhere; only na_cols
      if (useGPU) then
#ifdef WITH_MPI
223
#if COMPLEXCASE == 1
224
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
225
#endif
226 227
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
228 229 230
#if COMPLEXCASE == 1
         na_rows = na
#endif
231
        na_cols = na
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
232
#endif /* WITH_MPI */
233 234

        ! Here we convert the regular host array into a pinned host array
235
        successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
236
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
237
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
238 239
                  &MATH_DATATYPE&
                  &: error in cudaMalloc a_dev 1"
Andreas Marek's avatar
Andreas Marek committed
240
          stop 1
241 242
        endif

243
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
244
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
245
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
246 247
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
Andreas Marek's avatar
Andreas Marek committed
248
          stop 1
249 250
        endif

251
        successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
252
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
253
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
254 255
                  &MATH_DATATYPE&
                  &: error in cudaMalloc vav_dev 1"
Andreas Marek's avatar
Andreas Marek committed
256
          stop 1
257
        endif
258 259 260 261 262 263 264 265 266 267
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

268
#if REALCASE == 1
269 270 271 272
      if (useQR) then

        if (useGPU) then
          print *,"qr decomposition at the moment not supported with GPU"
Andreas Marek's avatar
Andreas Marek committed
273
          stop 1
274 275 276
        endif

        if (which_qr_decomposition == 1) then
277
          call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
278 279 280
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
281
            stop 1
282 283 284 285 286
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
287
            stop 1
288 289 290 291 292 293
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
294
            stop 1
295 296 297 298 299
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
300
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
301 302
               &PRECISION&
               &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
303 304 305 306
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
307
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
308 309
               &PRECISION&
               &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
310 311 312 313 314 315 316 317 318
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

          work_size = dwork_size(1)
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
319
            stop 1
320
          endif
321
          work_blocked = CONST_0_0
322 323 324
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
325
            stop 1
326 327 328 329 330
          endif

        endif ! which_qr_decomposition

      endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
331
#endif /* REALCASE */
332 333 334 335 336

      if (useGPU) then

        cur_l_rows = 0
        cur_l_cols = 0
Andreas Marek's avatar
Andreas Marek committed
337

338
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
339
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
340
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
341 342
                  &MATH_DATATYPE&
                  &: error in cudaMemcpy a_dev 2"
Andreas Marek's avatar
Andreas Marek committed
343
          stop 1
344
        endif
345 346 347 348 349 350 351 352 353 354 355
      endif ! useGPU


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

356 357
        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

358 359 360 361 362 363 364 365 366 367 368 369 370
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
371
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
372 373
                        &MATH_DATATYPE&
                        &: error when deallocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
374
                stop 1
375 376 377 378
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
379
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
380 381
                      &MATH_DATATYPE&
                      &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
382
              stop 1
383 384 385 386 387 388 389 390
            endif

          endif

          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
391
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
392 393
                        &MATH_DATATYPE&
                        &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
394
                stop 1
395 396 397 398
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
399
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
400
                        &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
Andreas Marek's avatar
Andreas Marek committed
401
                stop 1
402 403 404 405
              endif
            endif

            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
406

407
            if (istat .ne. 0) then
408
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
409 410
                      &MATH_DATATYPE&
                      &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
411
              stop 1
412
            endif
413
            successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
414
            if (.not.(successCUDA)) then
415
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
416 417
                      &MATH_DATATYPE&
                      &: error in cudaMalloc: vmr_dev2"
Andreas Marek's avatar
Andreas Marek committed
418
              stop 1
419 420 421
            endif

          endif
422 423 424 425 426

          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
427
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
428 429
                        &MATH_DATATYPE&
                        &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
430
                stop 1
431 432 433 434
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
435
                 print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
436 437
                         &MATH_DATATYPE&
                         &: error in cudaFree umc_dev 1"
Andreas Marek's avatar
Andreas Marek committed
438
                 stop 1
439 440 441
              endif

            endif
442

443
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
444

445
            if (istat .ne. 0) then
446
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
447 448
                      &MATH_DATATYPE&
                      &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
449
              stop 1
450 451
            endif

452
            successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
453
            if (.not.(successCUDA)) then
454
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
455 456
                      &MATH_DATATYPE&
                      &: error in cudaMalloc umc_dev 2"
Andreas Marek's avatar
Andreas Marek committed
457
              stop 1
458
            endif
459

460
          endif
461 462 463

        else ! GPU not used

464 465 466
          ! unify the the name vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
467 468 469

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
470
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
471 472
                     &MATH_DATATYPE&
                     &: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
473
            stop 1
474 475
          endif

476 477
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
478
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
479 480
                    &MATH_DATATYPE&
                    &: error when allocating umcCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
481
            stop 1
482
          endif
Andreas Marek's avatar
Andreas Marek committed
483

484 485
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
486
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
487 488
                    &MATH_DATATYPE&
                    &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
489
            stop 1
490 491 492 493 494
          endif

        endif ! use GPU

        if (useGPU) then
495
#if REALCASE == 1
496
          vmrCUDA(1 : cur_l_rows * n_cols) = CONST_0_0
497 498
#endif
#if COMPLEXCASE == 1
499
          vmrCUDA(1: cur_l_rows * n_cols) = CONST_COMPLEX_0_0
500
#endif
501
        else
502
#if REALCASE == 1
503
          vmrCPU(1:l_rows,1:n_cols) = CONST_0_0
504 505 506 507 508 509 510
#endif
#if COMPLEXCASE == 1
          vmrCPU(1:l_rows,1:n_cols) = CONST_COMPLEX_0_0
#endif
        endif ! useGPU

#if REALCASE == 1
511 512
        vr(:) = CONST_0_0
        tmat(:,:,istep) = CONST_0_0
513 514 515 516 517
#endif
#if COMPLEXCASE == 1
        vr(:) = CONST_COMPLEX_0_0
        tmat(:,:,istep) = CONST_COMPLEX_0_0
#endif
518
        if (useGPU) then
519
#if REALCASE == 1
520
          umcCUDA(1 : umc_size) = CONST_0_0
521
#endif
522 523 524 525
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

526
          if (lc_start .le. 0) lc_start = 1
527 528 529 530

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

531 532
          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
Andreas Marek's avatar
Retab  
Andreas Marek committed
533
                                      int((lda*size_of_datatype),kind=c_intptr_t), &
534 535 536 537
                                            (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
                                            int(lda*size_of_datatype,kind=c_intptr_t),              &
                                            int(lr_end*size_of_datatype,kind=c_intptr_t),           &
                                             int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
538

539

540
            if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
541
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
542 543
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy2d"
Andreas Marek's avatar
Andreas Marek committed
544
              stop 1
545 546 547 548 549
            endif
          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
550
#if REALCASE == 1
551 552 553 554
        if (useQR) then
          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
555
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
556 557
                 &PRECISION&
                 &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
558 559 560 561 562 563 564
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
565
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
566 567
                 &PRECISION&
                 &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
568 569 570 571 572 573 574 575 576 577
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
578
#endif /* REALCASE == 1 */
579 580
         do lc = n_cols, 1, -1

581
           ncol = istep*nbw + lc ! absolute column number of householder Vector
582 583 584 585 586 587 588 589 590 591 592 593 594
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

595
             ! Get Vector to be transformed; distribute last element and norm of
596 597
             ! remaining elements to all procs in current column

598
             vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
599 600 601 602 603 604

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
605
#if REALCASE == 1
606
               aux1(2) = CONST_0_0
607 608 609 610
#endif
#if COMPLEXCASE == 1
               aux1(2) = CONST_COMPLEX_0_0
#endif
611 612 613
             endif

#ifdef WITH_MPI
614
             if (wantDebug) call obj%timer%start("mpi_communication")
Andreas Marek's avatar
Andreas Marek committed
615
             call mpi_allreduce(aux1, aux2, 2, &
616
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
617
                                MPI_REAL_PRECISION, &
618 619
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
620
                                MPI_COMPLEX_PRECISION, &
621
#endif
Andreas Marek's avatar
Andreas Marek committed
622
                                MPI_SUM, mpi_comm_rows, mpierr)
623
             if (wantDebug) call obj%timer%stop("mpi_communication")
624 625 626

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
627
#endif
628 629 630 631 632

             vnorm2 = aux2(1)
             vrl    = aux2(2)

             ! Householder transformation
633
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
634
       call hh_transform_real_&
635 636
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
637
       call hh_transform_complex_&
638
#endif
Andreas Marek's avatar
Andreas Marek committed
639
             &PRECISION &
640
                              (obj, vrl, vnorm2, xf, tau, wantDebug)
641
             ! Scale vr and store Householder Vector for back transformation
642 643 644 645 646

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
647
#if REALCASE == 1
648
               vr(lr) = CONST_1_0
649 650 651 652
#endif
#if COMPLEXCASE == 1
               vr(lr) = CONST_COMPLEX_1_0
#endif
653 654 655 656 657 658
             else
               a(1:lr,lch) = vr(1:lr)
             endif

           endif

659
           ! Broadcast Householder Vector and tau along columns
660 661 662

           vr(lr+1) = tau
#ifdef WITH_MPI
663
           if (wantDebug) call obj%timer%start("mpi_communication")
664
           call MPI_Bcast(vr, lr+1, &
665
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
666
                          MPI_REAL_PRECISION, &
667 668
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
669
                          MPI_COMPLEX_PRECISION, &
670
#endif
Andreas Marek's avatar
Andreas Marek committed
671
                          cur_pcol, mpi_comm_cols, mpierr)
672
           if (wantDebug) call obj%timer%stop("mpi_communication")
673 674

#endif /* WITH_MPI */
675

676 677
           if (useGPU) then
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
678 679 680
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
681 682
           tau = vr(lr+1)

683 684 685 686 687 688
#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
689
           ! Transform remaining columns in current block with Householder Vector
690 691
           ! Local dot product

692
#if REALCASE == 1
693
           aux1 = 0
694 695 696 697
#endif
#if COMPLEXCASE == 1
          aux1 = CONST_COMPLEX_0_0
#endif
698 699

#ifdef WITH_OPENMP
700 701 702
#if 0
 ! original complex implementation without openmp. check performance
            nlc = 0 ! number of local columns
703 704 705 706 707 708 709 710 711 712
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
713
           if (wantDebug) call obj%timer%start("mpi_communication")
714 715 716 717 718 719 720 721 722 723
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
724

725 726 727
             endif
           enddo

728

729
           if (wantDebug) call obj%timer%stop("mpi_communication")
730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
!               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
755

756 757
!             endif
!           enddo
758
#endif /* if 0 */
759

760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
787
           if (wantDebug) call obj%timer%start("mpi_communication")
788 789
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc,     &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
790
                                     MPI_REAL_PRECISION,    &
791 792 793 794
#endif
#if COMPLEXCASE == 1
                                           MPI_COMPLEX_PRECISION, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
795
                         MPI_SUM, mpi_comm_rows, mpierr)
796
           if (wantDebug) call obj%timer%stop("mpi_communication")
797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
#if REALCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
826 827 828 829 830 831 832 833 834 835 836 837 838 839

#else /* WITH_OPENMP */

           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
840
           if (wantDebug) call obj%timer%start("mpi_communication")
841 842
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc,      &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
843
                                   MPI_REAL_PRECISION,   &
844 845 846
#endif
#if COMPLEXCASE == 1
                                         MPI_COMPLEX_PRECISION,&
847
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
848
           MPI_SUM, mpi_comm_rows, mpierr)
849
           if (wantDebug) call obj%timer%stop("mpi_communication")
850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
#if REALCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
#endif
#if COMPLEXCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
#endif
             endif
           enddo
#endif /* WITH_OPENMP */
869 870 871 872 873 874
         enddo ! lc

         if (useGPU) then
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
875
             successCUDA = cuda_memcpy2d((a_dev+        &
876 877 878 879 880
                                         int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
                                         int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
                                         int(lda*size_of_datatype,kind=c_intptr_t),           &
                                         int(lr_end*size_of_datatype,kind=c_intptr_t),        &
                                         int((lc_end - lc_start+1),kind=c_intptr_t), &
881
                                         int(cudaMemcpyHostToDevice,kind=c_int))
882

883
             if (.not.(successCUDA)) then
884
               print *, "bandred_&
885 886
                        &MATH_DATATYPE&
                        &: cuda memcpy a_dev  failed ", istat
Andreas Marek's avatar
Andreas Marek committed
887
               stop 1
888
             endif
889 890 891 892 893 894 895
           endif
         endif

         ! Calculate scalar products of stored Householder vectors.
         ! This can be done in different ways, we use dsyrk

         vav = 0
896
         call obj%timer%start("blas")
897 898
         if (useGPU) then
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
899 900 901 902 903 904
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',            &
#endif
#if COMPLEXCASE == 1
             call PRECISION_HERK('U', 'C',            &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
905
                           n_cols, l_rows, ONE, &
906 907 908
                           vmrCUDA, cur_l_rows, &
                           ZERO, vav, ubound(vav,dim=1))

909
         else ! useGPU