elpa1_tridiag_template.X90 42.4 KB
Newer Older
Pavel Kus's avatar
Pavel Kus committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

55 56 57 58 59 60 61

!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like Scalapack Routine PDSYTRD)
!>
!  Parameters
!
!> \param na          Order of matrix
!>
62
!> \param a_mat(lda,matrixCols)    Distributed matrix which should be reduced.
63 64 65 66 67 68 69 70 71 72 73 74 75
!>              Distribution is like in Scalapack.
!>              Opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half)
!>              a(:,:) is overwritten on exit with the Householder vectors
!>
!> \param lda         Leading dimension of a
!>
!> \param nblk        blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols  local columns of matrix
!>
!> \param mpi_comm_rows        MPI-Communicator for rows
!> \param mpi_comm_cols        MPI-Communicator for columns
!>
76
!> \param d_vec(na)       Diagonal elements (returned), identical on all processors
77
!>
78
!> \param e_vec(na)       Off-Diagonal elements (returned), identical on all processors
79 80
!>
!> \param tau(na)     Factors for the Householder vectors (returned), needed for back transformation
Pavel Kus's avatar
Pavel Kus committed
81 82
!>
!> \param useGPU      If true,  GPU version of the subroutine will be used
83
!>
84

85 86 87 88 89
    subroutine tridiag_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
    (na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU)
90 91
      use cuda_functions
      use iso_c_binding
Pavel Kus's avatar
Pavel Kus committed
92 93
#ifdef HAVE_DETAILED_TIMINGS
      use timings
94 95
#else
      use timings_dummy
Pavel Kus's avatar
Pavel Kus committed
96 97 98 99
#endif
      use precision
      implicit none

100 101
      logical, parameter      :: new_GPU_multiply = .true.
      
102 103
      integer(kind=ik), intent(in)                  :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      logical, intent(in)                           :: useGPU
104

105 106 107 108 109 110 111 112 113 114 115 116 117 118
#if REALCASE == 1
      real(kind=REAL_DATATYPE), intent(out)         :: tau(na)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), intent(out)   :: tau(na)
#endif
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,*)
#else
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,matrixCols)
#endif
#endif
#if COMPLEXCASE == 1
Pavel Kus's avatar
Pavel Kus committed
119
#ifdef USE_ASSUMED_SIZE
120
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*)
Pavel Kus's avatar
Pavel Kus committed
121
#else
122 123
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
#endif
Pavel Kus's avatar
Pavel Kus committed
124
#endif
125
      real(kind=REAL_DATATYPE), intent(out)         :: d_vec(na), e_vec(na)
Pavel Kus's avatar
Pavel Kus committed
126

127
      integer(kind=ik), parameter                   :: max_stored_uv = 32
Pavel Kus's avatar
Pavel Kus committed
128 129

      ! id in processor row and column and total numbers of processor rows and columns
130 131
      integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols, my_rank
      integer(kind=ik)                              :: mpierr
132 133 134 135 136 137 138
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
      real(kind=rk8), parameter                     :: ZERO=0.0_rk8, ONE = 1.0_rk8
#else
      real(kind=rk4), parameter                     :: ZERO=0.0_rk4, ONE = 1.0_rk4
#endif
#endif
139 140
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
141
      complex(kind=ck8), parameter                  :: ZERO = (0.0_rk8,0.0_rk8), ONE = (1.0_rk8,0.0_rk8)
142
#else
143
      complex(kind=ck4), parameter                  :: ZERO = (0.0_rk4,0.0_rk4), ONE = (1.0_rk4,0.0_rk4)
144 145 146 147
#endif
#endif
      integer(kind=ik)                              :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
                                                       max_local_cols
148
      ! updated after each istep (in the main cycle) to contain number of
Pavel Kus's avatar
Pavel Kus committed
149
      ! local columns and rows of the remaining part of the matrix
150 151 152
      !integer(kind=ik)                             :: l_cols, l_rows
      integer(kind=ik)                              :: l_cols, l_rows
      integer(kind=ik)                              :: n_stored_vecs
153

154 155 156
      integer(kind=C_intptr_T)                      :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
                                                       uv_stored_cols_dev
      logical                                       :: successCUDA
157

158 159 160
      integer(kind=ik)                              :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
      integer(kind=ik)                              :: tile_size, l_rows_per_tile, l_cols_per_tile
      integer(kind=c_size_t)                        :: a_offset
Pavel Kus's avatar
Pavel Kus committed
161 162

#ifdef WITH_OPENMP
163 164
      integer(kind=ik)                              :: my_thread, n_threads, max_threads, n_iter
      integer(kind=ik)                              :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
Pavel Kus's avatar
Pavel Kus committed
165 166
#endif

167 168 169 170 171 172 173
      real(kind=REAL_DATATYPE)                      :: vnorm2
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                      :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE)                :: vav, xc, aux(2*max_stored_uv),  aux1(2), aux2(2), aux3(1), vrl, xf
#endif
174

175 176 177 178 179 180 181 182 183 184
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp(:), &
                                                       v_row(:), &   ! used to store calculated Householder vector
                                                       v_col(:), &   ! the same vector, but transposed - differently distributed among MPI tasks
                                                       u_row(:), &
                                                       u_col(:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
#endif
185 186
      ! the following two matrices store pairs of vectors v and u calculated in each step
      ! at most max_stored_uv vector pairs are stored, than the matrix A_i is explicitli updated
187
      ! u and v are stored both in row and vector forms
188 189
      ! pattern: v1,u1,v2,u2,v3,u3,....
      ! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
190 191
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: vu_stored_rows(:,:)
192
      ! pattern: u1,v1,u2,v2,u3,v3,....
193 194 195 196 197
      real(kind=REAL_DATATYPE), allocatable         :: uv_stored_cols(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: vu_stored_rows(:,:), uv_stored_cols(:,:)
#endif
198

Pavel Kus's avatar
Pavel Kus committed
199
#ifdef WITH_OPENMP
200 201 202 203 204 205
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: ur_p(:,:), uc_p(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: ur_p(:,:), uc_p(:,:)
#endif
Pavel Kus's avatar
Pavel Kus committed
206 207
#endif

208 209 210 211 212
#if COMPLEXCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp_real(:)
#endif
      integer(kind=ik)                              :: istat
      character(200)                                :: errorMessage
213 214 215 216
      integer(kind=c_size_t), parameter             :: size_of_datatype = size_of_&
                                                                          &PRECISION&
                                                                          &_&
                                                                          &MATH_DATATYPE
217 218
      call timer%start("tridiag_&
      &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
219
      &" // &
220 221 222
      PRECISION_SUFFIX &
      )

223

Pavel Kus's avatar
Pavel Kus committed
224 225 226 227 228 229
      call timer%start("mpi_communication")
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      call timer%stop("mpi_communication")
230

231 232 233 234 235 236 237 238
      if((my_prow == 0) .and. (my_pcol == 0)) then
        if( new_GPU_multiply) then
          write(*,*) "mat vec multiply version 1"
        else
          write(*,*) "mat vec multiply version 0"
        endif
      endif

Pavel Kus's avatar
Pavel Kus committed
239
      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
240 241 242
      ! seems that tile is a square submatrix, consisting by several blocks
      ! it is a smallest possible square submatrix, where blocks being distributed among
      ! processors are "aligned" in both rows and columns
243
      !  -----------------
244 245 246 247 248 249 250 251 252 253 254 255
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- ...
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- .
      !   : :   : :   : :    .
      !   : :   : :   : :      .
      !
      ! this is a tile, where each number represents block, assigned to a processor with the shown number
256
      ! size of this small block is nblk
257
      ! Image is for situation with 6 processors, 3 processor rows and 2 columns
258
      ! tile_size is thus nblk * 6
259
      !
Pavel Kus's avatar
Pavel Kus committed
260 261
      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
262

263 264
      l_rows_per_tile = tile_size/np_rows ! local rows of a tile
      l_cols_per_tile = tile_size/np_cols ! local cols of a tile
265

Pavel Kus's avatar
Pavel Kus committed
266
      totalblocks = (na-1)/nblk + 1
267 268
      max_loc_block_rows = (totalblocks-1)/np_rows + 1
      max_loc_block_cols = (totalblocks-1)/np_cols + 1
Pavel Kus's avatar
Pavel Kus committed
269

270 271 272
      ! localy owned submatrix has size at most max_local_rows x max_local_cols at each processor
      max_local_rows = max_loc_block_rows*nblk
      max_local_cols = max_loc_block_cols*nblk
273

274 275 276 277 278
      ! allocate memmory for vectors
      ! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
      ! todo: if something has length max_local_rows, it is actually a column, no?
      ! todo: probably one should read it as v_row = vector v distributed among rows
      !
Pavel Kus's avatar
Pavel Kus committed
279
      allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
280 281
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "tmp", istat, errorMessage)
Pavel Kus's avatar
Pavel Kus committed
282

283
      ! allocate v_row 1 element longer to allow store and broadcast tau together with it
284
      allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
285 286
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_row", istat, errorMessage)
Pavel Kus's avatar
Pavel Kus committed
287

288
      allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
289 290 291
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_row", istat, errorMessage)

292
      allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
293 294 295
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_col", istat, errorMessage)

296
      allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
297 298
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_col", istat, errorMessage)
Pavel Kus's avatar
Pavel Kus committed
299 300 301 302 303

#ifdef WITH_OPENMP
      max_threads = omp_get_max_threads()

      allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
304 305 306
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "ur_p", istat, errorMessage)

Pavel Kus's avatar
Pavel Kus committed
307
      allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
308 309
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uc_p", istat, errorMessage)
310
#endif /* WITH_OPENMP */
Pavel Kus's avatar
Pavel Kus committed
311 312

      tmp = 0
313 314 315 316
      v_row = 0
      u_row = 0
      v_col = 0
      u_col = 0
Pavel Kus's avatar
Pavel Kus committed
317

318
      allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
319 320 321
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)

322
      allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
323 324 325
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)

326
      if (useGPU) then
327
         successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
328
         check_alloc_cuda("tridiag: v_row_dev", successCUDA)
329

330 331
         successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)

Andreas Marek's avatar
Andreas Marek committed
332
         check_alloc_cuda("tridiag: u_row_dev", successCUDA)
333

334
         successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
335
         check_alloc_cuda("tridiag: v_col_dev", successCUDA)
336

337
         successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
338
         check_alloc_cuda("tridiag: u_col_dev", successCUDA)
339

340
         successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
341
         check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
342

343
         successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
344
         check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
345
      endif !useGPU
Pavel Kus's avatar
Pavel Kus committed
346

347

Pavel Kus's avatar
Pavel Kus committed
348 349
      d_vec(:) = 0
      e_vec(:) = 0
Pavel Kus's avatar
Pavel Kus committed
350 351
      tau(:) = 0

352
      n_stored_vecs = 0
Pavel Kus's avatar
Pavel Kus committed
353

Pavel Kus's avatar
Pavel Kus committed
354 355
      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat
356
      if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
Pavel Kus's avatar
Pavel Kus committed
357
        d_vec(na) = a_mat(l_rows,l_cols)
358

359 360
      if (useGPU) then
        ! allocate memmory for matrix A on the device and than copy the matrix
361

362
        successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
363
        check_alloc_cuda("tridiag: a_dev", successCUDA)
364

365
        successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
Andreas Marek's avatar
Andreas Marek committed
366
        check_alloc_cuda("tridiag: a_dev", successCUDA)
367
      endif
Pavel Kus's avatar
Pavel Kus committed
368

369 370
      ! main cycle of tridiagonalization
      ! in each step, 1 Householder vector is calculated
371
      do istep = na, 3 ,-1
Pavel Kus's avatar
Pavel Kus committed
372

373 374 375 376
        ! Calculate number of local rows and columns of the still remaining matrix
        ! on the local processor
        l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
        l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)
Pavel Kus's avatar
Pavel Kus committed
377

378 379
        ! Calculate vector for Householder transformation on all procs
        ! owning column istep
Pavel Kus's avatar
Pavel Kus committed
380

381
        if (my_pcol == pcol(istep, nblk, np_cols)) then
Pavel Kus's avatar
Pavel Kus committed
382

383 384
          ! Get vector to be transformed; distribute last element and norm of
          ! remaining elements to all procs in current column
Pavel Kus's avatar
Pavel Kus committed
385

386 387
          ! copy l_cols + 1 column of A to v_row
          if (useGPU) then
388
	    a_offset = l_cols * lda * size_of_datatype
389
            ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
390

391
            successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
392
            check_memcpy_cuda("tridiag a_dev 1", successCUDA)
393 394 395
          else
            v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
          endif
396

397 398
            if (n_stored_vecs > 0 .and. l_rows > 0) then
	      call timer%start("blas")
399 400 401 402 403 404
#if COMPLEXCASE == 1
              aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
              call PRECISION_GEMV('N',   &
	                          l_rows, 2*n_stored_vecs,                                  &
                                  ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),        &
405
#if REALCASE == 1
406
                                  uv_stored_cols(l_cols+1,1), ubound(uv_stored_cols,dim=1), &
407 408
#endif
#if COMPLEXCASE == 1
409 410
                                   aux, 1,  &

411
#endif
412
                                  ONE, v_row, 1)
413 414
	       call timer%stop("blas")

415
            endif
416

417
            if(my_prow == prow(istep-1, nblk, np_rows)) then
418 419
               aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
               aux1(2) = v_row(l_rows)
Pavel Kus's avatar
Pavel Kus committed
420
            else
421
               aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
Pavel Kus's avatar
Pavel Kus committed
422 423 424 425 426
               aux1(2) = 0.
            endif

#ifdef WITH_MPI
            call timer%start("mpi_communication")
427 428
            call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
                               MPI_SUM, mpi_comm_rows, mpierr)
Pavel Kus's avatar
Pavel Kus committed
429 430
            call timer%stop("mpi_communication")
#else /* WITH_MPI */
431
          aux2 = aux1
Pavel Kus's avatar
Pavel Kus committed
432
#endif /* WITH_MPI */
433 434
          vnorm2 = aux2(1)
          vrl    = aux2(2)
Pavel Kus's avatar
Pavel Kus committed
435

436
          ! Householder transformation
437
#if REALCASE == 1
438
          call hh_transform_real_&
439 440
#endif
#if COMPLEXCASE == 1
441
          call hh_transform_complex_&
442
#endif
Andreas Marek's avatar
cleanup  
Andreas Marek committed
443
               &PRECISION &
444
                             (vrl, vnorm2, xf, tau(istep))
445
          ! Scale v_row and store Householder vector for back transformation
Pavel Kus's avatar
Pavel Kus committed
446

447
          v_row(1:l_rows) = v_row(1:l_rows) * xf
448
          if (my_prow == prow(istep-1, nblk, np_rows)) then
449 450 451 452 453 454 455
            v_row(l_rows) = 1.

            ! vrl is newly computed off-diagonal element of the final tridiagonal matrix
            e_vec(istep-1) = vrl
          endif

          ! store Householder vector for back transformation
456
          a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)
457

458
          ! add tau after the end of actuall v_row, to be broadcasted with it
459
          v_row(l_rows+1) = tau(istep)
460
         endif !(my_pcol == pcol(istep, nblk, np_cols))
Pavel Kus's avatar
Pavel Kus committed
461 462

#ifdef WITH_MPI
463
         ! Broadcast the Householder vector (and tau) along columns
464
         call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION,    &
465
                        pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
Pavel Kus's avatar
Pavel Kus committed
466 467
#endif /* WITH_MPI */

468 469
        !recover tau, which has been broadcasted together with v_row
        tau(istep) =  v_row(l_rows+1)
470

471
        ! Transpose Householder vector v_row -> v_col
472
	call elpa_transpose_vectors_&
Andreas Marek's avatar
cleanup  
Andreas Marek committed
473 474 475
             &MATH_DATATYPE&
             &_&
             &PRECISION &
476 477 478
                                   (v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
				    1, istep-1, 1, nblk)

479
        ! Calculate u = (A + VU**T + UV**T)*v
Pavel Kus's avatar
Pavel Kus committed
480

481 482
        ! For cache efficiency, we use only the upper half of the matrix tiles for this,
        ! thus the result is partly in u_col(:) and partly in u_row(:)
Pavel Kus's avatar
Pavel Kus committed
483

484 485
        u_col(1:l_cols) = 0
        u_row(1:l_rows) = 0
486
        if (l_rows > 0 .and. l_cols> 0 ) then
487
          if(useGPU) then
488
            successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
489
            check_memcpy_cuda("tridiag: u_col_dev", successCUDA)
490

491
            successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
492
            check_memcpy_cuda("tridiag: u_row_dev", successCUDA)
493

494
            successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)
495

Andreas Marek's avatar
Andreas Marek committed
496
            check_memcpy_cuda("tridiag: v_col_dev", successCUDA)
497

498
            successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
Andreas Marek's avatar
Andreas Marek committed
499
            check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
500 501 502
          endif ! useGU

#if REALCASE == 1
503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524
!            if (useGPU) then
             !u_col_dev(1:l_cols) = 0.
             !u_row_dev(1:l_rows) = 0.
             !v_col_dev(1:l_cols) = v_col(1:l_cols)
             !v_row_dev(1:l_rows) = v_row(1:l_rows)
!                do i=0,(istep-2)/tile_size
!                  l_col_beg = i*l_cols_per_tile+1
!                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
!                  if(l_col_end<l_col_beg) cycle
!                  do j=0,i
!                     l_row_beg = j*l_rows_per_tile+1
!                     l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
!                     if(l_row_end<l_row_beg) cycle
!                     if(mod(n_iter,n_threads) == my_thread) then
!                       call cublasDGEMV('T',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,1.d0,a_dev(l_row_beg,l_col_beg),lda,v_row_dev(l_row_beg),1,1.d0,u_col_dev(l_col_beg),1)
!                       if(i/=j) call cublasDGEMV('N',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,1.d0,a_dev(l_row_beg,l_col_beg),lda,v_col_dev(l_col_beg),1,1.d0,u_row_dev(l_row_beg),1)
!                     endif
!                     n_iter = n_iter+1
!                  enddo
!                enddo
!
             !--- for now, just use DSYMV!!!
525
             ! a_dev -> a_mat ?
Pavel Kus's avatar
Pavel Kus committed
526
             !write(*,*) "ubound ", ubound(a_mat,1), "lda", lda, "lcols", l_cols
527
!              call PRECISION_SYMV('U', l_cols,  &
Pavel Kus's avatar
Pavel Kus committed
528
!                         1.d0, a_mat, ubound(a_mat,1),  &
529
!                         v_row, 1,  &
530
!                         0.d0, u_col, 1)
531 532
             !u_col(1:l_cols) = u_col_dev(1:l_cols)
             !u_row(1:l_rows) = u_row_dev(1:l_rows)
533

534
!            else !do not use GPU
535 536
#endif

Pavel Kus's avatar
Pavel Kus committed
537
#ifdef WITH_OPENMP
538
          call timer%start("OpenMP parallel")
539
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
Pavel Kus's avatar
Pavel Kus committed
540

541 542
          my_thread = omp_get_thread_num()
          n_threads = omp_get_num_threads()
Pavel Kus's avatar
Pavel Kus committed
543

544
          n_iter = 0
Pavel Kus's avatar
Pavel Kus committed
545

546 547
          uc_p(1:l_cols,my_thread) = 0.
          ur_p(1:l_rows,my_thread) = 0.
548
#endif /* WITH_OPENMP */
549
          do i= 0, (istep-2)/tile_size
550 551
            l_col_beg = i*l_cols_per_tile+1
            l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
552 553
            if (l_col_end < l_col_beg) cycle
            do j = 0, i
554 555
              l_row_beg = j*l_rows_per_tile+1
              l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
556
              if (l_row_end < l_row_beg) cycle
Pavel Kus's avatar
Pavel Kus committed
557
#ifdef WITH_OPENMP
558
              if (mod(n_iter,n_threads) == my_thread) then
559
	        call timer%start("blas")
560 561
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                                    l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
562 563 564
                                    ONE, a_mat(l_row_beg,l_col_beg), lda,         &
                                    v_row(l_row_beg), 1, ONE, uc_p(l_col_beg,my_thread), 1)

565
                if (i/=j) then
566 567 568 569
                  call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,          &
                                      ONE, a_mat(l_row_beg,l_col_beg), lda, v_col(l_col_beg), 1,  &

                                      ONE, ur_p(l_row_beg,my_thread), 1)
570
                endif
571
	        call timer%stop("blas")
572 573
              endif
              n_iter = n_iter+1
Pavel Kus's avatar
Pavel Kus committed
574
#else /* WITH_OPENMP */
575

576
              if (useGPU) then
577 578 579 580
                if(.not. new_GPU_multiply) then 
                  a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * size_of_datatype
                  call timer%start("cublas")
                  call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ,  &
581 582 583
                                           l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,  &
                                           ONE, a_dev + a_offset, lda,                   &
                                           v_row_dev + (l_row_beg - 1) *                 &
584
                                           size_of_datatype, 1,                          &
585
                                           ONE, u_col_dev + (l_col_beg - 1) *            &
586
                                           size_of_datatype, 1)
587

588 589
                  if(i/=j) then
                    call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,  &
590 591
                                             ONE, a_dev + a_offset, lda,                        &
                                             v_col_dev + (l_col_beg - 1) *                      &
592
                                             size_of_datatype, 1,                          &
593
                                             ONE, u_row_dev + (l_row_beg - 1) *                 &
594
                                             size_of_datatype, 1)
595 596
                  endif
                  call timer%stop("cublas")
597 598
                endif
              else ! useGPU
599 600 601 602 603 604
                call timer%start("blas")
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ,  &
                            l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                            ONE, a_mat(l_row_beg, l_col_beg), lda,         &
                            v_row(l_row_beg), 1,                           &
                            ONE, u_col(l_col_beg), 1)
605 606

                if (i/=j) then
607

608
                  call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,  &
609 610 611
                                      ONE, a_mat(l_row_beg,l_col_beg), lda,               &
                                      v_col(l_col_beg), 1,                                &
                                      ONE, u_row(l_row_beg), 1)
612
                endif
613
	        call timer%stop("blas")
614
              endif ! useGPU
Pavel Kus's avatar
Pavel Kus committed
615 616

#endif /* WITH_OPENMP */
617
            enddo  ! j=0,i
618 619
          enddo  ! i=0,(istep-2)/tile_size

620
          if (useGPU) then
621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660
            if(new_GPU_multiply) then 
            
                do i=0,(istep-2)/tile_size
                    l_col_beg = i*l_cols_per_tile+1
                    l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                    if(l_col_end<l_col_beg) cycle
                                            
                    l_row_beg = 1
                    l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
                    
                    a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
                           size_of_datatype
                                
                    call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                               l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                               ONE, a_dev + a_offset, lda,  &
                               v_row_dev + (l_row_beg - 1) * size_of_datatype, 1,  &
                               ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
                enddo
                
                do i=0,(istep-2)/tile_size
                    l_col_beg = i*l_cols_per_tile+1
                    l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                    if(l_col_end<l_col_beg) cycle
                    
                    l_row_beg = 1
                    l_row_end = min(l_rows,i*l_rows_per_tile)

                    a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
                           size_of_datatype
                           
                    call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, & 
                               ONE, a_dev + a_offset, lda, &
                               v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
                               ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
                enddo

            
            endif          
          
661
            successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
662
            check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
663

664
            successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
665
            check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
666
          endif
667

668
!              call PRECISION_SYMV('U', l_cols,  &
Pavel Kus's avatar
Pavel Kus committed
669
!                         1.d0, a_mat, ubound(a_mat,1),  &
670
!                         v_row, 1,  &
671
!                         0.d0, u_col, 1)
672 673 674

!            endif ! useGPU

Pavel Kus's avatar
Pavel Kus committed
675 676
#ifdef WITH_OPENMP
!$OMP END PARALLEL
677
          call timer%stop("OpenMP parallel")
Pavel Kus's avatar
Pavel Kus committed
678

679 680 681 682 683
          do i=0,max_threads-1
            u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
            u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
          enddo
#endif /* WITH_OPENMP */
Pavel Kus's avatar
Pavel Kus committed
684

685
          if (n_stored_vecs > 0) then
686
	    call timer%start("blas")
687
#if REALCASE == 1
688
            call PRECISION_GEMV('T',     &
689 690
#endif
#if COMPLEXCASE == 1
691
            call PRECISION_GEMV('C',     &
692
#endif
693 694 695 696 697 698 699
	                        l_rows, 2*n_stored_vecs,   &
                                ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),   &
                                v_row,  1, ZERO, aux, 1)

            call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs,   &
                                ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1),   &
                                aux, 1, ONE, u_col,  1)
700
	    call timer%stop("blas")
701
          endif
Pavel Kus's avatar
Pavel Kus committed
702

703
        endif  ! (l_rows>0 .and. l_cols>0)
Pavel Kus's avatar
Pavel Kus committed
704

705
        ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
Pavel Kus's avatar
Pavel Kus committed
706
        ! on the processors containing the diagonal
707
        ! This is only necessary if u_row has been calculated, i.e. if the
Pavel Kus's avatar
Pavel Kus committed
708 709 710
        ! global tile size is smaller than the global remaining matrix

        if (tile_size < istep-1) then
711 712 713 714 715 716 717 718

	  call elpa_reduce_add_vectors_&
	  &MATH_DATATYPE&
	  &_&
	  &PRECISION &
          (u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
	   mpi_comm_cols, istep-1, 1, nblk)

Pavel Kus's avatar
Pavel Kus committed
719 720
        endif

721
        ! Sum up all the u_col(:) parts, transpose u_col -> u_row
Pavel Kus's avatar
Pavel Kus committed
722 723

        if (l_cols>0) then
724
          tmp(1:l_cols) = u_col(1:l_cols)
Pavel Kus's avatar
Pavel Kus committed
725 726
#ifdef WITH_MPI
          call timer%start("mpi_communication")
727
          call mpi_allreduce(tmp, u_col, l_cols, MPI_MATH_DATATYPE_PRECISION,    &
728
                             MPI_SUM, mpi_comm_rows, mpierr)
Pavel Kus's avatar
Pavel Kus committed
729 730
          call timer%stop("mpi_communication")
#else /* WITH_MPI */
731
          u_col = tmp
Pavel Kus's avatar
Pavel Kus committed
732 733 734
#endif /* WITH_MPI */
        endif

735 736 737 738 739 740 741
        call elpa_transpose_vectors_&
	&MATH_DATATYPE&
	&_&
	&PRECISION &
        (u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
	 mpi_comm_rows, 1, istep-1, 1, nblk)

Pavel Kus's avatar
Pavel Kus committed
742
        ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
743
#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
744
        x = 0
745
        if (l_cols>0)  &
746
           x = dot_product(v_col(1:l_cols),u_col(1:l_cols))
747 748 749 750 751 752 753
#endif
#if COMPLEXCASE == 1
        xc = 0
        if (l_cols>0)  &
           xc = dot_product(v_col(1:l_cols),u_col(1:l_cols))
#endif

Pavel Kus's avatar
Pavel Kus committed
754 755
#ifdef WITH_MPI
        call timer%start("mpi_communication")
756
#if REALCASE == 1
757
        call mpi_allreduce(x, vav, 1, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
758 759
#endif
#if COMPLEXCASE == 1
760
        call mpi_allreduce(xc, vav, 1 , MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
761
#endif
Pavel Kus's avatar
Pavel Kus committed
762 763
        call timer%stop("mpi_communication")
#else /* WITH_MPI */
764 765

#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
766
        vav = x
767 768 769 770 771
#endif
#if COMPLEXCASE == 1
        vav = xc
#endif

Pavel Kus's avatar
Pavel Kus committed
772 773 774 775 776 777
#endif /* WITH_MPI */

        ! store u and v in the matrices U and V
        ! these matrices are stored combined in one here

        do j=1,l_rows
778
#if REALCASE == 1
779 780
          vu_stored_rows(j,2*n_stored_vecs+1) = tau(istep)*v_row(j)
          vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*tau(istep)*vav*v_row(j) - u_row(j)
781 782 783 784 785
#endif
#if COMPLEXCASE == 1
          vu_stored_rows(j,2*n_stored_vecs+1) = conjg(tau(istep))*v_row(j)
          vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*conjg(tau(istep))*vav*v_row(j) - u_row(j)
#endif
Pavel Kus's avatar
Pavel Kus committed
786 787
        enddo
        do j=1,l_cols
788
#if REALCASE == 1
789 790
          uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*tau(istep)*vav*v_col(j) - u_col(j)
          uv_stored_cols(j,2*n_stored_vecs+2) = tau(istep)*v_col(j)
791 792 793 794 795
#endif
#if COMPLEXCASE == 1
          uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*conjg(tau(istep))*vav*v_col(j) - u_col(j)
          uv_stored_cols(j,2*n_stored_vecs+2) = conjg(tau(istep))*v_col(j)
#endif
Pavel Kus's avatar
Pavel Kus committed
796 797
        enddo

798 799 800 801
        ! We have calculated another Hauseholder vector, number of implicitly stored increased
        n_stored_vecs = n_stored_vecs+1

        ! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
802
        if (n_stored_vecs == max_stored_uv .or. istep == 3) then
803 804 805

          if (useGPU) then
            successCUDA = cuda_memcpy(vu_stored_rows_dev, loc(vu_stored_rows(1,1)), &
806
                                      max_local_rows * 2 * max_stored_uv *          &
807
                                      size_of_datatype, cudaMemcpyHostToDevice)
Andreas Marek's avatar
Andreas Marek committed
808
            check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)
809

810
            successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
811
                                      max_local_cols * 2 * max_stored_uv *          &
812
                                      size_of_datatype, cudaMemcpyHostToDevice)
Andreas Marek's avatar
Andreas Marek committed
813
            check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
814
          endif
Pavel Kus's avatar
Pavel Kus committed
815

816
          do i = 0, (istep-2)/tile_size
817 818 819 820 821
            l_col_beg = i*l_cols_per_tile+1
            l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
            l_row_beg = 1
            l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
            if (l_col_end<l_col_beg .or. l_row_end<l_row_beg) &
822
              cycle
823 824

            if (useGPU) then
825 826 827
              call timer%start("cublas")
              call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ,     &
                                         l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs,                      &
828
                                         ONE, vu_stored_rows_dev + (l_row_beg - 1) *                                         &
829
                                         size_of_datatype,  &
830
                                         max_local_rows, uv_stored_cols_dev + (l_col_beg - 1) *                              &
831
                                         size_of_datatype,  &
832
                                         max_local_cols, ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) *            &
833
                                         size_of_datatype , lda)
834
              call timer%stop("cublas")
835
            else !useGPU
836 837 838
              call timer%start("blas")
              call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ,                &
                                   l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs,    &
839 840 841
                                   ONE, vu_stored_rows(l_row_beg,1), ubound(vu_stored_rows,dim=1),   &
                                   uv_stored_cols(l_col_beg,1), ubound(uv_stored_cols,dim=1),        &
                                   ONE, a_mat(l_row_beg,l_col_beg), lda)
842
              call timer%stop("blas")
843
            endif !useGPU
Pavel Kus's avatar
Pavel Kus committed
844 845
          enddo

846
          n_stored_vecs = 0
Pavel Kus's avatar
Pavel Kus committed
847 848 849

        endif

850
        if (my_prow == prow(istep-1, nblk, np_rows) .and. my_pcol == pcol(istep-1, nblk, np_cols)) then
851
          if (useGPU) then
Pavel Kus's avatar
Pavel Kus committed
852
            !a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
853
             a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * size_of_datatype
854

Pavel Kus's avatar
Pavel Kus committed
855
             successCUDA = cuda_memcpy(loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &
856
                                       1 *  size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
857
             check_memcpy_cuda("tridiag: a_dev 3", successCUDA)
858

859
          endif
860
          if (n_stored_vecs > 0) then
Pavel Kus's avatar
Pavel Kus committed
861
            a_mat(l_rows,l_cols) = a_mat(l_rows,l_cols) &
862 863
                        + dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs))
          end if
Pavel Kus's avatar
Pavel Kus committed
864
          d_vec(istep-1) = a_mat(l_rows,l_cols)
865

866
          if (useGPU) then
Pavel Kus's avatar
Pavel Kus committed
867
            !a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols)
868
            !successCUDA = cuda_threadsynchronize()
Andreas Marek's avatar
Andreas Marek committed
869 870 871
            !check_memcpy_cuda("tridiag: a_dev 4a5a", successCUDA)

             successCUDA = cuda_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_size_t), &
Andreas Marek's avatar
Andreas Marek committed
872
                                       int(1 * size_of_datatype, kind=c_size_t), cudaMemcpyHostToDevice)
Andreas Marek's avatar
Andreas Marek committed
873
             check_memcpy_cuda("tridiag: a_dev 4", successCUDA)
874
          endif
Pavel Kus's avatar
Pavel Kus committed
875 876 877 878
        endif

      enddo ! main cycle over istep=na,3,-1

879 880 881 882 883 884 885
#if COMPLEXCASE == 1
      ! Store e_vec(1) and d_vec(1)

      if (my_pcol==pcol(2, nblk, np_cols)) then
        if (my_prow==prow(1, nblk, np_rows)) then
          ! We use last l_cols value of loop above
          if(useGPU) then
886 887
            successCUDA = cuda_memcpy(loc(aux3(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
                                    1 * size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
888
            check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
889 890 891 892
            vrl = aux3(1)
          else !useGPU
            vrl = a_mat(1,l_cols)
          endif !useGPU
893
          call hh_transform_complex_&
Pavel Kus's avatar
Pavel Kus committed
894 895
                                    &PRECISION &
                                    (vrl, CONST_REAL_0_0, xf, tau(2))
896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916
          e_vec(1) = vrl

          a_mat(1,l_cols) = 1. ! for consistency only
        endif
#ifdef WITH_MPI
        call timer%start("mpi_communication")
        call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, prow(1, nblk, np_rows), mpi_comm_rows, mpierr)
        call timer%stop("mpi_communication")

#endif /* WITH_MPI */
      endif

#ifdef WITH_MPI
      call timer%start("mpi_communication")
      call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr)
      call timer%stop("mpi_communication")

#endif /* WITH_MPI */
      if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols))  then
        if(useGPU) then
          successCUDA = cuda_memcpy(loc(aux3(1)), a_dev, &
917
                                    1 * size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
918
          check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
919 920 921 922 923 924 925 926 927
          d_vec(1) = PRECISION_REAL(aux3(1))
        else !useGPU
          d_vec(1) = PRECISION_REAL(a_mat(1,1))
        endif !useGPU
      endif

#endif /* COMPLEXCASE == 1 */

#if REALCASE == 1
928
      ! Store e_vec(1)
929

930
      if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
931
        if(useGPU) then
932 933
          successCUDA = cuda_memcpy(loc(e_vec(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
                                    1 * size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
934
          check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
935
        else !useGPU
Pavel Kus's avatar
Pavel Kus committed
936
          e_vec(1) = a_mat(1,l_cols) ! use last l_cols value of loop above
937 938
        endif !useGPU
      endif
939

Pavel Kus's avatar
Pavel Kus committed
940
     ! Store d_vec(1)
941
      if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
942
        if(useGPU) then
943
          successCUDA = cuda_memcpy(loc(d_vec(1)), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
Andreas Marek's avatar
Andreas Marek committed
944
          check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
945
        else !useGPU
Pavel Kus's avatar
Pavel Kus committed
946
          d_vec(1) = a_mat(1,1)
947
        endif !useGPU
948
      endif
949
#endif
Pavel Kus's avatar
Pavel Kus committed
950

951
      deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
Pavel Kus's avatar
Pavel Kus committed
952
      if (istat .ne. 0) then
953
#if REALCASE == 1
954
        print *,"tridiag_real: error when deallocating uv_stored_cols "//errorMessage
955 956 957 958
#endif
#if COMPLEXCASE == 1
        print *,"tridiag_complex: error when deallocating tmp "//errorMessage
#endif