elpa2_bandred_template.F90 64.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
63 64 65 66
    subroutine bandred_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
67
    (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
Andreas Marek's avatar
Andreas Marek committed
68
     tmat_dev, wantDebug, useGPU, success &
69
#if REALCASE == 1
70
     , useQR)
71 72
#endif
#if COMPLEXCASE == 1
73
     )
74
#endif
75

76
  !-------------------------------------------------------------------------------
77
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
112
      use elpa_abstract_impl
113 114
      implicit none

115
      class(elpa_abstract_impl_t), intent(inout) :: obj
116 117 118
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#if REALCASE == 1
119
#ifdef USE_ASSUMED_SIZE
120
      real(kind=REAL_DATATYPE)                    :: a(lda,*), tmat(nbw,nbw,*)
121
#else
122
      real(kind=REAL_DATATYPE)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
123
#endif
124 125 126 127 128 129 130
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
      complex(kind=COMPLEX_DATATYPE)              :: a(lda,*), tmat(nbw,nbw,*)
#else
      complex(kind=COMPLEX_DATATYPE)              :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
#endif
Andreas Marek's avatar
Andreas Marek committed
131 132 133 134 135 136 137 138 139 140 141 142
#endif /* COMPLEXCASE */

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
      real(kind=REAL_DATATYPE), parameter         :: ZERO = 0.0_rk8, ONE = 1.0_rk8

#else
      real(kind=REAL_DATATYPE), parameter         :: ZERO = 0.0_rk4, ONE = 1.0_rk4
#endif
#endif

#if COMPLEXCASE == 1
143
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
144
      complex(kind=COMPLEX_DATATYPE), parameter   :: ZERO = (0.0_rk8, 0.0_rk8), ONE = (1.0_rk8, 0.0_rk8)
145
#else
Andreas Marek's avatar
Andreas Marek committed
146
      complex(kind=COMPLEX_DATATYPE), parameter   :: ZERO = (0.0_rk4, 0.0_rk4), ONE = (1.0_rk4, 0.0_rk4)
147 148
#endif
#endif /* COMPLEXCASE == 1 */
149

150 151 152 153
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                    :: eps
#endif
      logical, intent(in)                         :: useGPU
154

155 156 157
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
158
      integer(kind=ik)                            :: vmrCols
159
#endif
160
      integer(kind=ik)                            :: mynlc
161 162 163
      integer(kind=ik)                            :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
164

165 166 167 168 169 170
      real(kind=REAL_DATATYPE)                    :: vnorm2
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE)              :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
171
#endif
172

173
#if COMPLEXCASE == 1
174 175
!      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
176
      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
Andreas Marek's avatar
Andreas Marek committed
177
      complex(kind=COMPLEX_DATATYPE), allocatable :: vr(:)
178
#endif
179

180 181 182 183 184 185 186 187 188 189 190 191
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable       :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      real(kind=REAL_DATATYPE), allocatable       :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      real(kind=REAL_DATATYPE), allocatable       :: vr(:)
#endif

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
      real(kind=REAL_DATATYPE)                    :: dwork_size(1)
      real(kind=REAL_DATATYPE), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
#endif
192
      ! a_dev is passed from bandred_real to trans_ev_band
193
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
194
#ifdef WITH_MPI
195 196 197 198
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
199
      integer(kind=c_intptr_t)                      :: lc_start, lc_end
200
#if COMPLEXCASE == 1
201
      integer(kind=c_intptr_t)                      :: lce_1, lcs_1, lre_1
202 203 204 205 206
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
207 208
#endif

209 210 211 212 213
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
214

215 216 217 218 219
#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
                                                    ii, pp, transformChunkSize
220
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
221 222 223
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
224

225
      call obj%timer%start("bandred_&
Andreas Marek's avatar
Andreas Marek committed
226 227 228 229
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )
230
      if (wantDebug) call obj%timer%start("mpi_communication")
231 232 233 234 235

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
Andreas Marek's avatar
Andreas Marek committed
236

237
      if (wantDebug) call obj%timer%stop("mpi_communication")
238 239 240 241 242 243 244
      success = .true.


      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
Andreas Marek's avatar
Andreas Marek committed
245
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
246 247
      &MATH_DATATYPE&
      &: ERROR: nbw=',nbw,', nblk=',nblk
Andreas Marek's avatar
Andreas Marek committed
248
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
249 250
      &MATH_DATATYPE&
      &: ELPA2 works only for nbw==n*nblk'
251 252 253 254 255 256 257 258 259
          endif
          success = .false.
          return
        endif
      endif

! na_rows in used nowhere; only na_cols
      if (useGPU) then
#ifdef WITH_MPI
260
#if COMPLEXCASE == 1
261
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
262
#endif
263 264
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
265 266 267
#if COMPLEXCASE == 1
         na_rows = na
#endif
268
        na_cols = na
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
269
#endif /* WITH_MPI */
270 271

        ! Here we convert the regular host array into a pinned host array
272
        successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
273
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
274
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
275 276
                  &MATH_DATATYPE&
                  &: error in cudaMalloc a_dev 1"
Andreas Marek's avatar
Andreas Marek committed
277
          stop 1
278 279
        endif

280
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
281
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
282
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
283 284
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
Andreas Marek's avatar
Andreas Marek committed
285
          stop 1
286 287
        endif

288
        successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
289
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
290
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
291 292
                  &MATH_DATATYPE&
                  &: error in cudaMalloc vav_dev 1"
Andreas Marek's avatar
Andreas Marek committed
293
          stop 1
294
        endif
295 296 297 298 299 300 301 302 303 304
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

305
#if REALCASE == 1
306 307 308 309
      if (useQR) then

        if (useGPU) then
          print *,"qr decomposition at the moment not supported with GPU"
Andreas Marek's avatar
Andreas Marek committed
310
          stop 1
311 312 313
        endif

        if (which_qr_decomposition == 1) then
314
          call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
315 316 317
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
318
            stop 1
319 320 321 322 323
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
324
            stop 1
325 326 327 328 329 330
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
331
            stop 1
332 333 334 335 336
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
337
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
338 339
               &PRECISION&
               &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
340 341 342 343
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
344
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
345 346
               &PRECISION&
               &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
347 348 349 350 351 352 353 354 355
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

          work_size = dwork_size(1)
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
356
            stop 1
357
          endif
358
          work_blocked = CONST_0_0
359 360 361
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
362
            stop 1
363 364 365 366 367
          endif

        endif ! which_qr_decomposition

      endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
368
#endif /* REALCASE */
369 370 371 372 373

      if (useGPU) then

        cur_l_rows = 0
        cur_l_cols = 0
Andreas Marek's avatar
Andreas Marek committed
374

375
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
376
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
377
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
378 379
                  &MATH_DATATYPE&
                  &: error in cudaMemcpy a_dev 2"
Andreas Marek's avatar
Andreas Marek committed
380
          stop 1
381
        endif
382 383 384 385 386 387 388 389 390 391 392
      endif ! useGPU


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

393 394
        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

395 396 397 398 399 400 401 402 403 404 405 406 407
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
408
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
409 410
                        &MATH_DATATYPE&
                        &: error when deallocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
411
                stop 1
412 413 414 415
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
416
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
417 418
                      &MATH_DATATYPE&
                      &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
419
              stop 1
420 421 422 423 424 425 426 427
            endif

          endif

          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
428
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
429 430
                        &MATH_DATATYPE&
                        &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
431
                stop 1
432 433 434 435
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
436
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
437
                        &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
Andreas Marek's avatar
Andreas Marek committed
438
                stop 1
439 440 441 442
              endif
            endif

            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
443

444
            if (istat .ne. 0) then
445
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
446 447
                      &MATH_DATATYPE&
                      &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
448
              stop 1
449
            endif
450
            successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
451
            if (.not.(successCUDA)) then
452
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
453 454
                      &MATH_DATATYPE&
                      &: error in cudaMalloc: vmr_dev2"
Andreas Marek's avatar
Andreas Marek committed
455
              stop 1
456 457 458
            endif

          endif
459 460 461 462 463

          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
464
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
465 466
                        &MATH_DATATYPE&
                        &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
467
                stop 1
468 469 470 471
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
472
                 print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
473 474
                         &MATH_DATATYPE&
                         &: error in cudaFree umc_dev 1"
Andreas Marek's avatar
Andreas Marek committed
475
                 stop 1
476 477 478
              endif

            endif
479

480
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
481

482
            if (istat .ne. 0) then
483
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
484 485
                      &MATH_DATATYPE&
                      &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
486
              stop 1
487 488
            endif

489
            successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
490
            if (.not.(successCUDA)) then
491
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
492 493
                      &MATH_DATATYPE&
                      &: error in cudaMalloc umc_dev 2"
Andreas Marek's avatar
Andreas Marek committed
494
              stop 1
495
            endif
496

497
          endif
498 499 500

        else ! GPU not used

501 502 503
          ! unify the the name vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
504 505 506

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
507
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
508 509
                     &MATH_DATATYPE&
                     &: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
510
            stop 1
511 512
          endif

513 514
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
515
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
516 517
                    &MATH_DATATYPE&
                    &: error when allocating umcCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
518
            stop 1
519
          endif
Andreas Marek's avatar
Andreas Marek committed
520

521 522
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
523
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
524 525
                    &MATH_DATATYPE&
                    &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
526
            stop 1
527 528 529 530 531
          endif

        endif ! use GPU

        if (useGPU) then
532
#if REALCASE == 1
533
          vmrCUDA(1 : cur_l_rows * n_cols) = CONST_0_0
534 535
#endif
#if COMPLEXCASE == 1
536
          vmrCUDA(1: cur_l_rows * n_cols) = CONST_COMPLEX_0_0
537
#endif
538
        else
539
#if REALCASE == 1
540
          vmrCPU(1:l_rows,1:n_cols) = CONST_0_0
541 542 543 544 545 546 547
#endif
#if COMPLEXCASE == 1
          vmrCPU(1:l_rows,1:n_cols) = CONST_COMPLEX_0_0
#endif
        endif ! useGPU

#if REALCASE == 1
548 549
        vr(:) = CONST_0_0
        tmat(:,:,istep) = CONST_0_0
550 551 552 553 554
#endif
#if COMPLEXCASE == 1
        vr(:) = CONST_COMPLEX_0_0
        tmat(:,:,istep) = CONST_COMPLEX_0_0
#endif
555
        if (useGPU) then
556
#if REALCASE == 1
557
          umcCUDA(1 : umc_size) = CONST_0_0
558
#endif
559 560 561 562
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

563
          if (lc_start .le. 0) lc_start = 1
564 565 566 567

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

568 569
          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
Andreas Marek's avatar
Retab  
Andreas Marek committed
570
                                      int((lda*size_of_datatype),kind=c_intptr_t), &
571 572 573 574
                                            (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
                                            int(lda*size_of_datatype,kind=c_intptr_t),              &
                                            int(lr_end*size_of_datatype,kind=c_intptr_t),           &
                                             int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
575

576

577
            if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
578
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
579 580
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy2d"
Andreas Marek's avatar
Andreas Marek committed
581
              stop 1
582 583 584 585 586
            endif
          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
587
#if REALCASE == 1
588 589 590 591
        if (useQR) then
          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
592
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
593 594
                 &PRECISION&
                 &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
595 596 597 598 599 600 601
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
602
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
603 604
                 &PRECISION&
                 &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
605 606 607 608 609 610 611 612 613 614
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
615
#endif /* REALCASE == 1 */
616 617
         do lc = n_cols, 1, -1

618
           ncol = istep*nbw + lc ! absolute column number of householder Vector
619 620 621 622 623 624 625 626 627 628 629 630 631
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

632
             ! Get Vector to be transformed; distribute last element and norm of
633 634
             ! remaining elements to all procs in current column

635
             vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
636 637 638 639 640 641

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
642
#if REALCASE == 1
643
               aux1(2) = CONST_0_0
644 645 646 647
#endif
#if COMPLEXCASE == 1
               aux1(2) = CONST_COMPLEX_0_0
#endif
648 649 650
             endif

#ifdef WITH_MPI
651
             if (wantDebug) call obj%timer%start("mpi_communication")
Andreas Marek's avatar
Andreas Marek committed
652
             call mpi_allreduce(aux1, aux2, 2, &
653
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
654
                                MPI_REAL_PRECISION, &
655 656
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
657
                                MPI_COMPLEX_PRECISION, &
658
#endif
Andreas Marek's avatar
Andreas Marek committed
659
                                MPI_SUM, mpi_comm_rows, mpierr)
660
             if (wantDebug) call obj%timer%stop("mpi_communication")
661 662 663

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
664
#endif
665 666 667 668 669

             vnorm2 = aux2(1)
             vrl    = aux2(2)

             ! Householder transformation
670
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
671
       call hh_transform_real_&
672 673
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
674
       call hh_transform_complex_&
675
#endif
Andreas Marek's avatar
Andreas Marek committed
676
             &PRECISION &
677
                              (obj, vrl, vnorm2, xf, tau, wantDebug)
678
             ! Scale vr and store Householder Vector for back transformation
679 680 681 682 683

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
684
#if REALCASE == 1
685
               vr(lr) = CONST_1_0
686 687 688 689
#endif
#if COMPLEXCASE == 1
               vr(lr) = CONST_COMPLEX_1_0
#endif
690 691 692 693 694 695
             else
               a(1:lr,lch) = vr(1:lr)
             endif

           endif

696
           ! Broadcast Householder Vector and tau along columns
697 698 699

           vr(lr+1) = tau
#ifdef WITH_MPI
700
           if (wantDebug) call obj%timer%start("mpi_communication")
701
           call MPI_Bcast(vr, lr+1, &
702
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
703
                          MPI_REAL_PRECISION, &
704 705
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
706
                          MPI_COMPLEX_PRECISION, &
707
#endif
Andreas Marek's avatar
Andreas Marek committed
708
                          cur_pcol, mpi_comm_cols, mpierr)
709
           if (wantDebug) call obj%timer%stop("mpi_communication")
710 711

#endif /* WITH_MPI */
712

713 714
           if (useGPU) then
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
715 716 717
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
718 719
           tau = vr(lr+1)

720 721 722 723 724 725
#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
726
           ! Transform remaining columns in current block with Householder Vector
727 728
           ! Local dot product

729
#if REALCASE == 1
730
           aux1 = 0
731 732 733 734
#endif
#if COMPLEXCASE == 1
          aux1 = CONST_COMPLEX_0_0
#endif
735 736

#ifdef WITH_OPENMP
737 738 739
#if 0
 ! original complex implementation without openmp. check performance
            nlc = 0 ! number of local columns
740 741 742 743 744 745 746 747 748 749
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
750
           if (wantDebug) call obj%timer%start("mpi_communication")
751 752 753 754 755 756 757 758 759 760
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
761

762 763 764
             endif
           enddo

765

766
           if (wantDebug) call obj%timer%stop("mpi_communication")
767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
!               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
792

793 794
!             endif
!           enddo
795
#endif /* if 0 */
796

797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
824
           if (wantDebug) call obj%timer%start("mpi_communication")
825 826
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc,     &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
827
                                     MPI_REAL_PRECISION,    &
828 829 830 831
#endif
#if COMPLEXCASE == 1
                                           MPI_COMPLEX_PRECISION, &
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
832
                         MPI_SUM, mpi_comm_rows, mpierr)
833
           if (wantDebug) call obj%timer%stop("mpi_communication")
834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
#if REALCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
863 864 865 866 867 868 869 870 871 872 873 874 875 876

#else /* WITH_OPENMP */

           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
877
           if (wantDebug) call obj%timer%start("mpi_communication")
878 879
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc,      &
#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
880
                                   MPI_REAL_PRECISION,   &
881 882 883
#endif
#if COMPLEXCASE == 1
                                         MPI_COMPLEX_PRECISION,&
884
#endif
Andreas Marek's avatar
Retab  
Andreas Marek committed
885
           MPI_SUM, mpi_comm_rows, mpierr)
886
           if (wantDebug) call obj%timer%stop("mpi_communication")
887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
#if REALCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
#endif
#if COMPLEXCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
#endif
             endif
           enddo
#endif /* WITH_OPENMP */
906 907 908 909 910 911
         enddo ! lc

         if (useGPU) then
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
912
             successCUDA = cuda_memcpy2d((a_dev+        &
913 914 915 916 917
                                         int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
                                         int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
                                         int(lda*size_of_datatype,kind=c_intptr_t),           &
                                         int(lr_end*size_of_datatype,kind=c_intptr_t),        &
                                         int((lc_end - lc_start+1),kind=c_intptr_t), &
918
                                         int(cudaMemcpyHostToDevice,kind=c_int))