elpa2_trans_ev_band_to_full_template.F90 31.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

52
#include "../general/sanity.F90"
53 54 55 56 57

    subroutine trans_ev_band_to_full_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
58
    (obj, na, nqc, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, &
Andreas Marek's avatar
Andreas Marek committed
59
     q_dev, ldq, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, useGPU &
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
#if REALCASE == 1
     ,useQr)
#endif
#if COMPLEXCASE == 1
     )
#endif

    !-------------------------------------------------------------------------------
    !  trans_ev_band_to_full_real/complex:
    !  Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix
    !
    !  Parameters
    !
    !  na          Order of matrix a, number of rows of matrix q
    !
    !  nqc         Number of columns of matrix q
    !
    !  nblk        blocksize of cyclic distribution, must be the same in both directions!
    !
    !  nbw         semi bandwith
    !
    !  a(lda,matrixCols)    Matrix containing the Householder vectors (i.e. matrix a after bandred_real/complex)
    !              Distribution is like in Scalapack.
    !
    !  lda         Leading dimension of a
    !  matrixCols  local columns of matrix a and q
    !
    !  tmat(nbw,nbw,numBlocks) Factors returned by bandred_real/complex
    !
    !  q           On input: Eigenvectors of band matrix
    !              On output: Transformed eigenvectors
    !              Distribution is like in Scalapack.
    !
    !  ldq         Leading dimension of q
    !
    !  mpi_comm_rows
    !  mpi_comm_cols
    !              MPI-Communicators for rows/columns
    !
    !-------------------------------------------------------------------------------
      use precision
      use cuda_functions
      use iso_c_binding
103
      use elpa_abstract_impl
104
      implicit none
105
#include "../general/precision_kinds.F90"
106
      class(elpa_abstract_impl_t), intent(inout) :: obj
107 108 109 110 111 112
      logical, intent(in)                    :: useGPU
#if REALCASE == 1
     logical, intent(in)                     :: useQR
#endif
      integer(kind=ik)                       :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
113
      MATH_DATATYPE(kind=rck)               :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
114
#else
115
      MATH_DATATYPE(kind=rck)               :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
116 117 118 119 120 121 122 123 124
#endif
      integer(kind=C_intptr_T)               :: a_dev ! passed from bandred_real at the moment not used since copied in bandred_real

      integer(kind=ik)                       :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                       :: max_blocks_row, max_blocks_col, max_local_rows, &
                                                max_local_cols
      integer(kind=ik)                       :: l_cols, l_rows, l_colh, n_cols
      integer(kind=ik)                       :: istep, lc, ncol, nrow, nb, ns

125
      MATH_DATATYPE(kind=rck), allocatable  :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
126 127 128 129 130 131 132 133
      ! hvm_dev is fist used and set in this routine
      ! q is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted
      ! tmp_dev is first used in this routine
      ! tmat_dev is passed along from bandred_real
      integer(kind=C_intptr_T)               :: hvm_dev, q_dev, tmp_dev, tmat_dev

      integer(kind=ik)                       :: i

Andreas Marek's avatar
Andreas Marek committed
134
#ifdef BAND_TO_FULL_BLOCKING
135
      MATH_DATATYPE(kind=rck), allocatable  :: tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:)
136 137
      integer(kind=ik)                       :: cwy_blocking, t_blocking, t_cols, t_rows
#endif
Andreas Marek's avatar
Andreas Marek committed
138

139 140 141
      integer(kind=ik)                       :: istat
      character(200)                         :: errorMessage
      logical                                :: successCUDA
142
      integer(kind=c_intptr_t), parameter      :: size_of_datatype = size_of_&
143 144 145
                                                                   &PRECISION&
                                                                   &_&
                                                                   &MATH_DATATYPE
146
      call obj%timer%start("trans_ev_band_to_full_&
147
      &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
148
      &" // &
149 150
      &PRECISION_SUFFIX &
      )
151
      call obj%timer%start("mpi_communication")
152 153 154 155 156 157

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

158
      call obj%timer%stop("mpi_communication")
159 160 161 162 163 164 165 166 167 168 169 170 171 172

      max_blocks_row = ((na -1)/nblk)/np_rows + 1  ! Rows of A
      max_blocks_col = ((nqc-1)/nblk)/np_cols + 1  ! Columns of q!

      max_local_rows = max_blocks_row*nblk
      max_local_cols = max_blocks_col*nblk

      if (useGPU) then

#if REALCASE == 1
        ! here the GPU and CPU version diverged: the CPU version now always uses the useQR path which
        ! is not implemented in the GPU version
#endif

Andreas Marek's avatar
Andreas Marek committed
173 174 175
        ! the GPU version does not (yet) support blocking
        ! but the handling is the same for real/complex case

176 177 178
        allocate(tmp1(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
179 180
                  &MATH_DATATYPE&
                  &: error when allocating tmp1 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
181
          stop 1
182 183 184 185 186
        endif

        allocate(tmp2(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
187 188
                   &MATH_DATATYPE&
                   &: error when allocating tmp2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
189
          stop 1
190 191 192 193 194
        endif

        allocate(hvb(max_local_rows*nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
195 196
                  &MATH_DATATYPE&
                  &: error when allocating hvb "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
197
          stop 1
198 199 200 201 202
        endif

        allocate(hvm(max_local_rows,nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
203 204
                  &MATH_DATATYPE&
                  &: error when allocating hvm "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
205
          stop 1
206 207
        endif

208
        successCUDA = cuda_malloc(hvm_dev, (max_local_rows)*nbw* size_of_datatype)
209 210
        if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
211 212
                  &MATH_DATATYPE&
                  &: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
213
          stop 1
214 215
        endif

216
        successCUDA = cuda_malloc(tmp_dev, (max_local_cols)*nbw* size_of_datatype)
217 218
        if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
219 220
                  &MATH_DATATYPE&
                  &: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
221
          stop 1
222 223 224
        endif

!#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
225 226 227 228
!! it should be possible to keep tmat dev on the device and not copy it around
!! already existent on GPU
!        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* &
!#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
229
!  size_of_PRECISION_real)
Andreas Marek's avatar
Andreas Marek committed
230 231 232 233 234 235 236
!#endif
!#if COMPLEXCASE == 1
!        size_of_PRECISION_complex)
!#endif
!
!        if (.not.(successCUDA)) then
!          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
237 238
!    &MATH_DATATYPE&
!    &: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
239
!          stop 1
Andreas Marek's avatar
Andreas Marek committed
240
!        endif
241 242 243 244
!#endif

#if REALCASE == 1
! q_dev already living on device
245
!        successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
246 247
!        if (.not.(successCUDA)) then
!          print *,"trans_ev_band_to_full_real: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
248
!          stop 1
249 250 251 252 253 254 255 256
!        endif
  !      q_temp(:,:) = 0.0
  !      q_temp(1:ldq,1:na_cols) = q(1:ldq,1:na_cols)

!        ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
!        successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
!        if (.not.(successCUDA)) then
!          print *,"trans_ev_band_to_full_real: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
257
!          stop 1
258 259 260
!        endif
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
261 262 263
!         successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_PRECISION_complex)
!         if (.not.(successCUDA)) then
!           print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
264
!           stop 1
Andreas Marek's avatar
Andreas Marek committed
265 266 267 268 269
!         endif
!
!         successCUDA = cuda_memcpy(q_dev, loc(q),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
!          if (.not.(successCUDA)) then
!            print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
Andreas Marek's avatar
Andreas Marek committed
270
!            stop 1
Andreas Marek's avatar
Andreas Marek committed
271
!          endif
272 273 274
#endif

        ! if MPI is NOT used the following steps could be done on the GPU and memory transfers could be avoided
275
        successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)* size_of_datatype)
276 277
        if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
278 279
                  &MATH_DATATYPE&
                  &: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
280
          stop 1
281 282
        endif

283 284
        hvm = 0.0_rck   ! Must be set to 0 !!!
        hvb = 0.0_rck   ! Safety only
285 286 287 288 289 290 291 292 293 294 295 296
        l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q

        do istep=1,(na-1)/nbw

          n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

          ! Broadcast all Householder vectors for current step compressed in hvb

          nb = 0
          ns = 0

          do lc = 1, n_cols
297
            ncol = istep*nbw + lc ! absolute column number of householder Vector
298 299 300 301 302 303 304 305 306 307 308
            nrow = ncol - nbw ! absolute number of pivot row

            l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
            l_colh = local_index(ncol  , my_pcol, np_cols, nblk, -1) ! HV local column number

            if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh)

            nb = nb+l_rows

            if (lc==n_cols .or. mod(ncol,nblk)==0) then
#ifdef WITH_MPI
309
              call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
310
              call MPI_Bcast(hvb(ns+1), nb-ns, MPI_MATH_DATATYPE_PRECISION,&
311
                             pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
312

313
              call obj%timer%stop("mpi_communication")
314 315 316 317 318 319 320 321 322 323 324 325 326 327

#endif /* WITH_MPI */
              ns = nb
            endif
          enddo

          ! Expand compressed Householder vectors into matrix hvm

          nb = 0
          do lc = 1, n_cols
            nrow = (istep-1)*nbw+lc ! absolute number of pivot row
            l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast

            hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
328
            if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1.0_rck
329 330 331
            nb = nb+l_rows
          enddo

332
          successCUDA = cuda_memcpy(hvm_dev, loc(hvm), max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
333 334 335

          if (.not.(successCUDA)) then
            print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
Andreas Marek's avatar
Andreas Marek committed
336
            stop 1
337 338 339 340 341 342 343 344

          endif

          l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1)

          ! Q = Q - V * T**T * V**T * Q

          if (l_rows>0) then
Andreas Marek's avatar
Andreas Marek committed
345
            call obj%timer%start("cublas")
Pavel Kus's avatar
Pavel Kus committed
346
            call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N',        &
Andreas Marek's avatar
Retab  
Andreas Marek committed
347
                                 n_cols, l_cols, l_rows, ONE, hvm_dev, max_local_rows, &
348
                                       q_dev, ldq , ZERO, tmp_dev, n_cols)
Andreas Marek's avatar
Andreas Marek committed
349
            call obj%timer%stop("cublas")
350 351 352 353 354

#ifdef WITH_MPI

            ! copy data from device to host for a later MPI_ALLREDUCE
            ! copy to host maybe this can be avoided this is needed if MPI is used (allreduce)
355
            successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
356 357
            if (.not.(successCUDA)) then
              print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
Andreas Marek's avatar
Andreas Marek committed
358
              stop 1
359 360
            endif

361

362
#else /* WITH_MPI */
363
            ! in real case no copy needed. Don't do it in complex case neither
Andreas Marek's avatar
Andreas Marek committed
364 365
#endif /* WITH_MPI */

366
          else ! l_rows>0
367
            tmp1(1:l_cols*n_cols) = 0.0_rck
368 369 370
          endif ! l_rows>0

#ifdef WITH_MPI
371
          call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
372
          call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_MATH_DATATYPE_PRECISION, &
373
                             MPI_SUM, mpi_comm_rows, mpierr)
374
          call obj%timer%stop("mpi_communication")
375 376 377 378 379 380 381 382 383

#else /* WITH_MPI */
!          tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
#endif /* WITH_MPI */

          if (l_rows>0) then
#ifdef WITH_MPI
            ! after the mpi_allreduce we have to copy back to the device
            ! copy back to device
384
            successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols* size_of_datatype, &
Andreas Marek's avatar
Retab  
Andreas Marek committed
385
              cudaMemcpyHostToDevice)
386 387
            if (.not.(successCUDA)) then
              print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
388 389
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy"
Andreas Marek's avatar
Andreas Marek committed
390
              stop 1
391 392
            endif
#else /* WITH_MPI */
393
            ! in real case no memcopy needed. Don't do it in complex case neither
394 395
#endif /* WITH_MPI */

396
!#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
397 398
           ! IMPORTANT: even though tmat_dev is transfered from the previous rutine, we have to copy from tmat again
           ! tmat is 3-dimensional array, while tmat_dev contains only one 2-dimensional slice of it - and here we
399 400 401 402 403
           ! need to upload another slice
           successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)

           if (.not.(successCUDA)) then
             print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
404 405
                     &MATH_DATATYPE&
                     &: error in cudaMemcpy"
406 407
             stop 1
           endif
408
!#endif /* WITH_MPI */
409

Andreas Marek's avatar
Andreas Marek committed
410
            call obj%timer%start("cublas")
Pavel Kus's avatar
Pavel Kus committed
411
            call cublas_PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N',       &
412 413 414 415
                                        n_cols, l_cols, ONE, tmat_dev, nbw, tmp_dev, n_cols)

            call cublas_PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -ONE, hvm_dev, max_local_rows, &
                                       tmp_dev, n_cols, one, q_dev, ldq)
Andreas Marek's avatar
Andreas Marek committed
416
            call obj%timer%stop("cublas")
417 418 419

            ! copy to host maybe this can be avoided
            ! this is not necessary hvm is not used anymore
420
            successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost)
421 422
            if (.not.(successCUDA)) then
              print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
Andreas Marek's avatar
Andreas Marek committed
423
              stop 1
424 425 426 427 428 429 430 431
            endif
          endif ! l_rows > 0

        enddo ! istep



      else ! do not useGPU
Andreas Marek's avatar
Andreas Marek committed
432 433

#ifdef BAND_TO_FULL_BLOCKING
434 435 436 437 438 439 440 441 442
        ! t_blocking was formerly 2; 3 is a better choice
        t_blocking = 3 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once

        ! we only use the t_blocking if we could call it fully, this is might be better but needs to benchmarked.
!       if ( na >= ((t_blocking+1)*nbw) ) then
        cwy_blocking = t_blocking * nbw

        allocate(tmp1(max_local_cols*cwy_blocking), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
443
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
444 445
                  &MATH_DATATYPE&
                  &: error when allocating tmp1 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
446
          stop 1
447 448 449 450
        endif

        allocate(tmp2(max_local_cols*cwy_blocking), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
451
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
452 453
                  &MATH_DATATYPE&
                  &: error when allocating tmp2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
454
          stop 1
455 456 457 458
        endif

        allocate(hvb(max_local_rows*cwy_blocking), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
459
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
460 461
                  &MATH_DATATYPE&
                  &: error when allocating hvb "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
462
          stop 1
463 464 465 466
        endif

        allocate(hvm(max_local_rows,cwy_blocking), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
467
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
468 469
                  &MATH_DATATYPE&
                  &: error when allocating hvm "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
470
          stop 1
471 472
        endif

Andreas Marek's avatar
Andreas Marek committed
473 474
#else /* BAND_TO_FULL_BLOCKING */

475 476
        allocate(tmp1(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
477
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
478 479
                  &MATH_DATATYPE&
                  &: error when allocating tmp1 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
480
          stop 1
481 482 483 484
        endif

        allocate(tmp2(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
485
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
486
                  &MATH_DATATYPE&: error when allocating tmp2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
487
          stop 1
488 489 490 491
        endif

        allocate(hvb(max_local_rows*nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
492
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
493 494
                  &MATH_DATATYPE&
                  &: error when allocating hvb "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
495
          stop 1
496 497 498 499
        endif

        allocate(hvm(max_local_rows,nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
500
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
501 502
                  &MATH_DATATYPE&
                  &: error when allocating hvm "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
503
          stop 1
504
        endif
Andreas Marek's avatar
Andreas Marek committed
505
#endif /* BAND_TO_FULL_BLOCKING */
506

Andreas Marek's avatar
Andreas Marek committed
507
#ifdef BAND_TO_FULL_BLOCKING
508 509
        allocate(tmat_complete(cwy_blocking,cwy_blocking), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
510
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
511 512
                   &MATH_DATATYPE&
                   &: error when allocating tmat_complete "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
513
          stop 1
514 515 516
        endif
        allocate(t_tmp(cwy_blocking,nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
517
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
518 519
                  &MATH_DATATYPE&
                  &: error when allocating t_tmp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
520
          stop 1
521 522 523
        endif
        allocate(t_tmp2(cwy_blocking,nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
524
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
525 526
                  &MATH_DATATYPE&
                  &: error when allocating t_tmp2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
527
          stop 1
528 529 530 531 532 533 534 535 536
        endif
#endif
!        else
!          allocate(tmp1(max_local_cols*nbw))
!          allocate(tmp2(max_local_cols*nbw))
!          allocate(hvb(max_local_rows*nbw))
!          allocate(hvm(max_local_rows,nbw))
!        endif

537 538
        hvm = 0.0_rck   ! Must be set to 0 !!!
        hvb = 0.0_rck   ! Safety only
539 540 541 542
        l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q

!       if ( na >= ((t_blocking+1)*nbw) ) then

Andreas Marek's avatar
Andreas Marek committed
543
#ifdef BAND_TO_FULL_BLOCKING
544
        do istep=1,((na-1)/nbw-1)/t_blocking + 1
Andreas Marek's avatar
Andreas Marek committed
545
#else
546 547 548
        do istep=1,(na-1)/nbw
#endif

Andreas Marek's avatar
Andreas Marek committed
549
#ifdef BAND_TO_FULL_BLOCKING
550
          ! This the call when using  na >= ((t_blocking+1)*nbw)
551 552
          !      n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw
          ! Number of columns in current step
553 554 555 556 557 558 559 560 561
          ! As an alternative we add some special case handling if na < cwy_blocking
          IF (na < cwy_blocking) THEN
            n_cols = MAX(0, na-nbw)
            IF ( n_cols .eq. 0 ) THEN
              EXIT
            END IF
          ELSE
            n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step
          END IF
Andreas Marek's avatar
Andreas Marek committed
562
#else /* BAND_TO_FULL_BLOCKING */
563
          n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
Andreas Marek's avatar
Andreas Marek committed
564
#endif /* BAND_TO_FULL_BLOCKING */
565 566 567 568 569 570
          ! Broadcast all Householder vectors for current step compressed in hvb

          nb = 0
          ns = 0

          do lc = 1, n_cols
Andreas Marek's avatar
Andreas Marek committed
571
#ifdef BAND_TO_FULL_BLOCKING
572
            ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder Vector
Andreas Marek's avatar
Andreas Marek committed
573
#else
574
            ncol = istep*nbw + lc ! absolute column number of householder Vector
575 576 577 578 579 580 581 582 583 584 585 586
#endif
            nrow = ncol - nbw ! absolute number of pivot row

            l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
            l_colh = local_index(ncol  , my_pcol, np_cols, nblk, -1) ! HV local column number

            if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh)

            nb = nb+l_rows

            if (lc==n_cols .or. mod(ncol,nblk)==0) then
#ifdef WITH_MPI
587
              call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
588
              call MPI_Bcast(hvb(ns+1), nb-ns, MPI_MATH_DATATYPE_PRECISION,    &
589
                             pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
590

591
              call obj%timer%stop("mpi_communication")
592 593 594 595 596 597 598 599 600 601

#endif /* WITH_MPI */
              ns = nb
            endif
          enddo ! lc

          ! Expand compressed Householder vectors into matrix hvm

          nb = 0
          do lc = 1, n_cols
Andreas Marek's avatar
Andreas Marek committed
602
#ifdef BAND_TO_FULL_BLOCKING
603
            nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row
Andreas Marek's avatar
Andreas Marek committed
604
#else
605 606 607 608 609
            nrow = (istep-1)*nbw+lc ! absolute number of pivot row
#endif
            l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast

            hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
610
            if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1.0_rck
611 612 613
            nb = nb+l_rows
          enddo

Andreas Marek's avatar
Andreas Marek committed
614
#ifdef BAND_TO_FULL_BLOCKING
615 616 617 618 619 620 621 622 623
          l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1)

          ! compute tmat2 out of tmat(:,:,)
          tmat_complete = 0
          do i = 1, t_blocking
            t_cols = MIN(nbw, n_cols - (i-1)*nbw)
            if (t_cols <= 0) exit
            t_rows = (i - 1) * nbw
            tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i)
Andreas Marek's avatar
Andreas Marek committed
624

625
            if (i > 1) then
Andreas Marek's avatar
Andreas Marek committed
626
              call obj%timer%start("blas")
Pavel Kus's avatar
Pavel Kus committed
627
              call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N',      &
Andreas Marek's avatar
Retab  
Andreas Marek committed
628
                           t_rows, t_cols, l_rows, ONE, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), &
Andreas Marek's avatar
Andreas Marek committed
629 630
                                 max_local_rows, ZERO, t_tmp, cwy_blocking)

Andreas Marek's avatar
Andreas Marek committed
631
              call obj%timer%stop("blas")
632
#ifdef WITH_MPI
633
              call obj%timer%start("mpi_communication")
634

Pavel Kus's avatar
Pavel Kus committed
635
              call mpi_allreduce(t_tmp, t_tmp2, cwy_blocking*nbw, MPI_MATH_DATATYPE_PRECISION,    &
Andreas Marek's avatar
Andreas Marek committed
636
                                 MPI_SUM, mpi_comm_rows, mpierr)
637
              call obj%timer%stop("mpi_communication")
Andreas Marek's avatar
Andreas Marek committed
638
              call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
639 640
              call PRECISION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, ONE, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
              call PRECISION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -ONE, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
641
                         t_tmp2, cwy_blocking)
Andreas Marek's avatar
Andreas Marek committed
642
              call obj%timer%stop("blas")
643 644 645

              tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)

Andreas Marek's avatar
Andreas Marek committed
646
#else /* WITH_MPI */
647
!              t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw)
Andreas Marek's avatar
Andreas Marek committed
648
              call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
649 650
              call PRECISION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, ONE, tmat_complete, cwy_blocking, t_tmp, cwy_blocking)
              call PRECISION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -ONE, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
651
                         t_tmp, cwy_blocking)
Andreas Marek's avatar
Andreas Marek committed
652
              call obj%timer%stop("blas")
653 654 655

              tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp(1:t_rows,1:t_cols)

Andreas Marek's avatar
Andreas Marek committed
656
#endif /* WITH_MPI */
657 658 659 660 661 662 663 664

!              call PRECISION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, CONST_1_0, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
!              call PRECISION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -CONST_1_0, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
!                         t_tmp2, cwy_blocking)

!              tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
             endif
          enddo
Andreas Marek's avatar
Andreas Marek committed
665
#else /* BAND_TO_FULL_BLOCKING */
666 667 668 669 670 671
          l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1)
#endif

          ! Q = Q - V * T**T * V**T * Q

          if (l_rows>0) then
672
            call obj%timer%start("blas")
673

Pavel Kus's avatar
Pavel Kus committed
674
            call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N',         &
Andreas Marek's avatar
Retab  
Andreas Marek committed
675
                          n_cols, l_cols, l_rows, ONE, hvm, ubound(hvm,dim=1), &
676
                                q, ldq, ZERO, tmp1, n_cols)
Andreas Marek's avatar
Andreas Marek committed
677
            call obj%timer%stop("blas")
678 679 680

          else ! l_rows>0

681
            tmp1(1:l_cols*n_cols) = 0.0_rck
682 683 684
          endif ! l_rows>0

#ifdef WITH_MPI
685
          call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
686
          call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_rows ,mpierr)
687
          call obj%timer%stop("mpi_communication")
688

Andreas Marek's avatar
Andreas Marek committed
689
          call obj%timer%start("blas")
690 691

          if (l_rows>0) then
Andreas Marek's avatar
Andreas Marek committed
692 693
#ifdef BAND_TO_FULL_BLOCKING

Pavel Kus's avatar
Pavel Kus committed
694
            call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N',        &
Andreas Marek's avatar
Retab  
Andreas Marek committed
695
                          n_cols, l_cols, ONE, tmat_complete, cwy_blocking, tmp2, n_cols)
696
            call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -ONE, hvm, ubound(hvm,dim=1), tmp2, n_cols, ONE, q, ldq)
Andreas Marek's avatar
Andreas Marek committed
697 698 699

#else /* BAND_TO_FULL_BLOCKING */

Pavel Kus's avatar
Pavel Kus committed
700
            call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N',        &
Andreas Marek's avatar
Andreas Marek committed
701
                                n_cols, l_cols, ONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols)
702 703
            call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -ONE, hvm, ubound(hvm,dim=1), &
                                tmp2, n_cols, ONE, q, ldq)
Andreas Marek's avatar
Andreas Marek committed
704 705

#endif /* BAND_TO_FULL_BLOCKING */
706 707

          endif
Andreas Marek's avatar
Andreas Marek committed
708
          call obj%timer%stop("blas")
709 710
#else /* WITH_MPI */
!          tmp2 = tmp1
Andreas Marek's avatar
Andreas Marek committed
711
          call obj%timer%start("blas")
712
          if (l_rows>0) then
Andreas Marek's avatar
Andreas Marek committed
713
#ifdef BAND_TO_FULL_BLOCKING
Pavel Kus's avatar
Pavel Kus committed
714
            call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N',        &
Andreas Marek's avatar
Andreas Marek committed
715
                                n_cols, l_cols, ONE, tmat_complete, cwy_blocking, tmp1, n_cols)
716
            call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -ONE, hvm, ubound(hvm,dim=1), tmp1, n_cols, ONE, q, ldq)
Andreas Marek's avatar
Andreas Marek committed
717 718
#else /* BAND_TO_FULL_BLOCKING */

Pavel Kus's avatar
Pavel Kus committed
719
            call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N',        &
Andreas Marek's avatar
Andreas Marek committed
720
                                n_cols, l_cols, ONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
721 722
            call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -ONE, hvm, ubound(hvm,dim=1), &
                                tmp1, n_cols, ONE, q, ldq)
Andreas Marek's avatar
Andreas Marek committed
723 724

#endif  /* BAND_TO_FULL_BLOCKING */
725
          endif
Andreas Marek's avatar
Andreas Marek committed
726
          call obj%timer%stop("blas")
727 728 729 730 731 732 733 734 735 736 737 738 739 740
#endif /* WITH_MPI */

!          if (l_rows>0) then
!            call PRECISION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, CONST_1_0, tmat_complete, cwy_blocking, tmp2, n_cols)
!            call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -CONST_1_0, hvm, ubound(hvm,dim=1), tmp2, n_cols, CONST_1_0, q, ldq)
!          endif

        enddo ! istep

      endif ! useGPU

      deallocate(tmp1, tmp2, hvb, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
741 742
                 &MATH_DATATYPE&
                 &: error when deallocating tmp1 tmp2 hvb "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
743
        stop 1
744 745 746 747 748 749
      endif

      if (useGPU) then
        successCUDA = cuda_free(hvm_dev)
        if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
750 751
                  &MATH_DATATYPE&
                  &: error in cudaFree"
Andreas Marek's avatar
Andreas Marek committed
752
          stop 1
753 754 755 756 757
        endif

        successCUDA = cuda_free(tmp_dev)
        if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
758 759
                  &MATH_DATATYPE&
                  &: error in cudaFree"
Andreas Marek's avatar
Andreas Marek committed
760
          stop 1
761 762 763 764 765
        endif

        successCUDA = cuda_free(tmat_dev)
        if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
766 767
                  &MATH_DATATYPE&
                  &: error in cudaFree"
Andreas Marek's avatar
Andreas Marek committed
768
          stop 1
769 770 771
        endif

         ! final transfer of q_dev
772
         successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
773 774 775

         if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
776 777
                  &MATH_DATATYPE&
                  &: error in cudamemcpu q_dev"
Andreas Marek's avatar
Andreas Marek committed
778
          stop 1
779 780 781 782 783 784 785
         endif

         !   q(1:ldq,1:na_cols) = q_temp(1:ldq,1:na_cols)

         successCUDA = cuda_free(q_dev)
         if (.not.(successCUDA)) then
           print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
786 787
                   &MATH_DATATYPE&
                   &: error in cudaFree"
Andreas Marek's avatar
Andreas Marek committed
788
           stop 1
789 790 791 792 793
         endif

         !   deallocate(q_temp, stat=istat, errmsg=errorMessage)
         !   if (istat .ne. 0) then
         !     print *,"error when deallocating q_temp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
794
         !     stop 1
795 796 797 798
         !   endif
         !   deallocate(tmat_temp, stat=istat, errmsg=errorMessage)
         !   if (istat .ne. 0) then
         !     print *,"trans_ev_band_to_full_real: error when deallocating tmat_temp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
799
         !     stop 1
800 801 802 803 804 805 806
         !   endif

      endif ! useGPU

      deallocate(hvm, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
807 808
                &MATH_DATATYPE&
                &: error when deallocating hvm "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
809
        stop 1
810 811
      endif

Andreas Marek's avatar
Andreas Marek committed
812
#if BAND_TO_FULL_BLOCKING
813 814 815 816
      if (.not.(useGPU)) then
        deallocate(tmat_complete, t_tmp, t_tmp2, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
817 818
                  &MATH_DATATYPE&
                  &: error when deallocating tmat_complete, t_tmp, t_tmp2 "//errorMessage
819 820
          stop 1
        endif
821 822 823
      endif
#endif

824
      call obj%timer%stop("trans_ev_band_to_full_&
825 826 827 828 829 830 831 832 833 834 835
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX&
      )

    end subroutine trans_ev_band_to_full_&
    &MATH_DATATYPE&
    &_&
    &PRECISION