#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"

#undef SAVE_MATR
#ifdef DOUBLE_PRECISION_REAL
#define SAVE_MATR(name, iteration) \
call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,name,iteration)
#else
#define SAVE_MATR(name, iteration)
#endif

!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like the ScaLAPACK routine PDSYTRD)
!>
!  Parameters
!
!> \param obj         object of elpa_type
!> \param na          Order of matrix
!>
!> \param a_mat(lda,matrixCols)    Distributed matrix which should be reduced.
!>              Distribution is like in ScaLAPACK.
!>              In contrast to PDSYTRD, a(:,:) must be set completely (upper and lower half)
!>              a(:,:) is overwritten on exit with the Householder vectors
!>
!> \param lda         Leading dimension of a
!>
!> \param nblk        blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols  local columns of matrix
!>
!> \param mpi_comm_rows        MPI-Communicator for rows
!> \param mpi_comm_cols        MPI-Communicator for columns
!>
!> \param d_vec(na)       Diagonal elements (returned), identical on all processors
!>
!> \param e_vec(na)       Off-diagonal elements (returned), identical on all processors
!>
!> \param tau(na)     Factors for the Householder vectors (returned), needed for back transformation
!>
!> \param useGPU      If true, the GPU version of the subroutine will be used
!> \param wantDebug   if true, print more debug information
!>
    subroutine tridiag_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
    (obj, na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug, max_threads)
      use cuda_functions
      use iso_c_binding
      use precision
      use elpa_abstract_impl
      use matrix_plot
#ifdef WITH_OPENMP
      use omp_lib
#endif
      implicit none
#include "../general/precision_kinds.F90"
      class(elpa_abstract_impl_t), intent(inout) :: obj
      integer(kind=ik), intent(in)                  :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      logical, intent(in)                           :: useGPU, wantDebug

#if REALCASE == 1
      real(kind=REAL_DATATYPE), intent(out)         :: tau(na)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), intent(out)   :: tau(na)
#endif
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,*)
#else
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,matrixCols)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*)
#else
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
#endif
#endif
      real(kind=REAL_DATATYPE), intent(out)         :: d_vec(na), e_vec(na)

      integer(kind=ik), parameter                   :: max_stored_uv = 32
      logical,          parameter                   :: mat_vec_as_one_block = .true.

      ! id in processor row and column and total numbers of processor rows and columns
      integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols
      integer(kind=ik)                              :: mpierr
      integer(kind=ik)                              :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
                                                       max_local_cols
      ! updated after each istep (in the main cycle) to contain number of
      ! local columns and rows of the remaining part of the matrix
      !integer(kind=ik)                             :: l_cols, l_rows
      integer(kind=ik)                              :: l_cols, l_rows
      integer(kind=ik)                              :: n_stored_vecs

      integer(kind=C_intptr_T)                      :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
                                                       uv_stored_cols_dev
      logical                                       :: successCUDA

      integer(kind=ik)                              :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
      integer(kind=ik)                              :: tile_size, l_rows_per_tile, l_cols_per_tile
      integer(kind=c_intptr_t)                      :: a_offset

      integer(kind=ik), intent(in)                  :: max_threads
#ifdef WITH_OPENMP
      integer(kind=ik)                              :: my_thread, n_threads, n_iter
#endif

      real(kind=REAL_DATATYPE)                      :: vnorm2
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                      :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE)                :: vav, xc, aux(2*max_stored_uv),  aux1(2), aux2(2), aux3(1), vrl, xf
#endif

#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp(:), &
                                                       v_row(:), &   ! used to store the calculated Householder vector
                                                       v_col(:), &   ! the same vector, but transposed - differently distributed among MPI tasks
                                                       u_row(:), &
                                                       u_col(:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
#endif
      ! the following two matrices store pairs of vectors v and u calculated in each step
      ! at most max_stored_uv vector pairs are stored, then the matrix A_i is explicitly updated
      ! u and v are stored both in row and column form
      ! pattern: v1,u1,v2,u2,v3,u3,....
      ! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
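      ! Conceptual sketch (not a literal transcription of the code): deferring the update means
      ! that, once max_stored_uv pairs have been collected, a_mat is corrected by one
      ! rank-2*n_stored_vecs product built from vu_stored_rows (l_rows x 2*n_stored_vecs) and
      ! uv_stored_cols (l_cols x 2*n_stored_vecs), which is why the main loop works with
      ! u = (A + VU**T + UV**T)*v instead of A*v alone.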
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: vu_stored_rows(:,:)
      ! pattern: u1,v1,u2,v2,u3,v3,....
      real(kind=REAL_DATATYPE), allocatable         :: uv_stored_cols(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: vu_stored_rows(:,:), uv_stored_cols(:,:)
#endif

#ifdef WITH_OPENMP
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: ur_p(:,:), uc_p(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: ur_p(:,:), uc_p(:,:)
#endif
#endif

#if COMPLEXCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp_real(:)
#endif
      integer(kind=ik)                              :: min_tile_size, error
      integer(kind=ik)                              :: istat
      character(200)                                :: errorMessage
      character(20)                                 :: gpuString
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
                                                                          &PRECISION&
                                                                          &_&
                                                                          &MATH_DATATYPE
      if(useGPU) then
        gpuString = "_gpu"
      else
        gpuString = ""
      endif

      call obj%timer%start("tridiag_&
      &MATH_DATATYPE&
      &" // &
      PRECISION_SUFFIX // &
      gpuString )


      if (wantDebug) call obj%timer%start("mpi_communication")
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      if (wantDebug) call obj%timer%stop("mpi_communication")

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
      ! a tile is a square submatrix, consisting of several blocks;
      ! it is the smallest possible square submatrix for which the blocks distributed among
      ! the processors are "aligned" in both rows and columns
      !  -----------------
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- ...
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- .
      !   : :   : :   : :    .
      !   : :   : :   : :      .
      !
      ! this is a tile, where each number represents a block assigned to the processor with that number
      ! size of this small block is nblk
      ! the picture is for a situation with 6 processors, 3 processor rows and 2 processor columns
      ! tile_size is thus nblk * 6
      !
      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size

      ! make tile_size the smallest possible multiple of the previously defined tile size that is
      ! larger than or equal to min_tile_size
      ! min_tile_size was originally hardcoded as 128 * max(np_rows, np_cols), which is now the default value
      ! it can, however, be set by the user
      call obj%get("min_tile_size", min_tile_size, error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option min_tile_size. Aborting..."
        stop
      endif
      if(min_tile_size == 0) then
        ! not set by the user, use the default value
        min_tile_size = 128*max(np_rows, np_cols)
      endif
      tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
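      ! Example (hypothetical numbers, for illustration only): with np_rows = 3, np_cols = 2
      ! as in the picture above and nblk = 32, the minimum global tile size is
      ! 32 * lcm(3,2) = 192; the default min_tile_size is then 128 * max(3,2) = 384, so
      ! tile_size is rounded up to ((384-1)/192 + 1) * 192 = 384, i.e. two minimal tiles.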

      l_rows_per_tile = tile_size/np_rows ! local rows of a tile
      l_cols_per_tile = tile_size/np_cols ! local cols of a tile

      totalblocks = (na-1)/nblk + 1
      max_loc_block_rows = (totalblocks-1)/np_rows + 1
      max_loc_block_cols = (totalblocks-1)/np_cols + 1

      ! locally owned submatrix has size at most max_local_rows x max_local_cols on each processor
      max_local_rows = max_loc_block_rows*nblk
      max_local_cols = max_loc_block_cols*nblk

      ! allocate memory for vectors
      ! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
      ! todo: if something has length max_local_rows, it is actually a column, no?
      ! todo: probably one should read it as v_row = vector v distributed among rows
      !
      allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "tmp", istat, errorMessage)

      ! allocate v_row 1 element longer to allow storing and broadcasting tau together with it
      allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_row", istat, errorMessage)

      allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_row", istat, errorMessage)

      allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_col", istat, errorMessage)

      allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_col", istat, errorMessage)

#ifdef WITH_OPENMP
      allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "ur_p", istat, errorMessage)

      allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uc_p", istat, errorMessage)
#endif /* WITH_OPENMP */

      tmp = 0
      v_row = 0
      u_row = 0
      v_col = 0
      u_col = 0

      allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)

      allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)

      if (useGPU) then
         successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
         check_alloc_cuda("tridiag: v_row_dev", successCUDA)

         successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)

         check_alloc_cuda("tridiag: u_row_dev", successCUDA)

         successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
         check_alloc_cuda("tridiag: v_col_dev", successCUDA)

         successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
         check_alloc_cuda("tridiag: u_col_dev", successCUDA)

         successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
         check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)

         successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
         check_alloc_cuda("tridiag: uv_stored_cols_dev", successCUDA)
      endif !useGPU


      d_vec(:) = 0
      e_vec(:) = 0
      tau(:) = 0

      n_stored_vecs = 0

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat
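      ! Illustration of the block-cyclic distribution (hypothetical numbers, assuming 1-based
      ! global indices): for na = 10, nblk = 2 and np_rows = 3, process row 0 owns global rows
      ! 1-2 and 7-8, process row 1 owns rows 3-4 and 9-10, and process row 2 owns rows 5-6,
      ! so the call above would yield l_rows = 4 on process row 0.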

      if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
#if COMPLEXCASE == 1
        d_vec(na) = real(a_mat(l_rows,l_cols), kind=rk)
#endif
#if REALCASE == 1
        d_vec(na) = a_mat(l_rows,l_cols)
#endif

      if (useGPU) then
        ! allocate memory for matrix A on the device and then copy the matrix

        successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
        check_alloc_cuda("tridiag: a_dev", successCUDA)

        successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("tridiag: a_dev", successCUDA)
      endif

      ! main cycle of tridiagonalization
      ! in each step, 1 Householder vector is calculated
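      ! Informal summary of one iteration (orientation only, details follow below): a Householder
      ! vector for column istep is computed and broadcast along the process columns, transposed
      ! into v_col, u = (A + VU**T + UV**T)*v is accumulated tile by tile, the new off-diagonal
      ! element is stored in e_vec(istep-1), and the (v,u) pair is appended to vu_stored_rows /
      ! uv_stored_cols until max_stored_uv pairs trigger an explicit update of a_mat.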
      do istep = na, 3 ,-1

        ! Calculate number of local rows and columns of the still remaining matrix
        ! on the local processor
        l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
        l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)

        ! Calculate the vector for the Householder transformation on all procs
        ! owning column istep

        if (my_pcol == pcol(istep, nblk, np_cols)) then

          ! Get the vector to be transformed; distribute the last element and the norm of
          ! the remaining elements to all procs in the current column

          ! copy the (l_cols + 1)-th column of A to v_row
          if (useGPU) then
            a_offset = l_cols * lda * size_of_datatype
            ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)

            successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag a_dev 1", successCUDA)
          else
            v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
          endif

            if (n_stored_vecs > 0 .and. l_rows > 0) then
              if (wantDebug) call obj%timer%start("blas")
#if COMPLEXCASE == 1
              aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
              call PRECISION_GEMV('N',   &
                                  l_rows, 2*n_stored_vecs,                                  &
                                  ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),        &
#if REALCASE == 1
                                  uv_stored_cols(l_cols+1,1), ubound(uv_stored_cols,dim=1), &
#endif
#if COMPLEXCASE == 1
                                   aux, 1,  &

#endif
                                  ONE, v_row, 1)
               if (wantDebug) call obj%timer%stop("blas")

            endif

            if(my_prow == prow(istep-1, nblk, np_rows)) then
               aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
               aux1(2) = v_row(l_rows)
            else
               aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
               aux1(2) = 0.
            endif

#ifdef WITH_MPI
            if (wantDebug) call obj%timer%start("mpi_communication")
            call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
                               MPI_SUM, mpi_comm_rows, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          aux2 = aux1
#endif /* WITH_MPI */

#if REALCASE == 1
          vnorm2 = aux2(1)
#endif
#if COMPLEXCASE == 1
          vnorm2 = real(aux2(1),kind=rk)
#endif
          vrl    = aux2(2)

          ! Householder transformation
#if REALCASE == 1
          call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
          call hh_transform_complex_&
#endif
               &PRECISION &
                             (obj, vrl, vnorm2, xf, tau(istep), wantDebug)
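          ! Note (standard Householder relation, for orientation only): the transformation has
          ! the form H = I - tau(istep) * v * v**H, with v given by v_row after the scaling by
          ! xf below; applied from both sides it annihilates column istep above the sub-diagonal,
          ! and vrl becomes the new off-diagonal element stored in e_vec(istep-1).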
          ! Scale v_row and store the Householder vector for back transformation

          v_row(1:l_rows) = v_row(1:l_rows) * xf
          if (my_prow == prow(istep-1, nblk, np_rows)) then
            v_row(l_rows) = 1.

            ! vrl is the newly computed off-diagonal element of the final tridiagonal matrix
#if REALCASE == 1
            e_vec(istep-1) = vrl
#endif
#if COMPLEXCASE == 1
            e_vec(istep-1) = real(vrl,kind=rk)
#endif
          endif

          ! store the Householder vector for back transformation
          a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)

          ! add tau after the end of the actual v_row, to be broadcast together with it
          v_row(l_rows+1) = tau(istep)
         endif !(my_pcol == pcol(istep, nblk, np_cols))

!          SAVE_MATR("HH vec stored", na - istep + 1)

#ifdef WITH_MPI
         if (wantDebug) call obj%timer%start("mpi_communication")
         ! Broadcast the Householder vector (and tau) along columns
         call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION,    &
                        pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
         if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

        ! recover tau, which has been broadcast together with v_row
        tau(istep) =  v_row(l_rows+1)

        ! Transpose the Householder vector v_row -> v_col
        call elpa_transpose_vectors_&
             &MATH_DATATYPE&
             &_&
             &PRECISION &
                   (obj, v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
                    1, istep-1, 1, nblk, max_threads)

        ! Calculate u = (A + VU**T + UV**T)*v
        ! For cache efficiency, we use only the upper half of the matrix tiles for this,
        ! thus the result is partly in u_col(:) and partly in u_row(:)
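        ! Note (restating the symmetry argument above): a tile below the diagonal is the
        ! (conjugate) transpose of its mirror tile above the diagonal, so the loops below visit
        ! only tiles with j <= i; the transposed GEMV accumulates the mirror-tile contribution
        ! into u_col(:), while the plain GEMV (for i /= j) accumulates the direct contribution
        ! of the upper tile into u_row(:).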

        u_col(1:l_cols) = 0
        u_row(1:l_rows) = 0
        if (l_rows > 0 .and. l_cols > 0) then
          if (useGPU) then
            successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
            check_memcpy_cuda("tridiag: u_col_dev", successCUDA)

            successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
            check_memcpy_cuda("tridiag: u_row_dev", successCUDA)

            successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)

            check_memcpy_cuda("tridiag: v_col_dev", successCUDA)

            successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
          endif ! useGPU

#ifdef WITH_OPENMP
          call obj%timer%start("OpenMP parallel")
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)

          my_thread = omp_get_thread_num()

          n_threads = omp_get_num_threads()

          n_iter = 0

          ! first calculate A*v part of (A + VU**T + UV**T)*v
          uc_p(1:l_cols,my_thread) = 0.
          ur_p(1:l_rows,my_thread) = 0.
#endif /* WITH_OPENMP */
          do i= 0, (istep-2)/tile_size
            l_col_beg = i*l_cols_per_tile+1
            l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
            if (l_col_end < l_col_beg) cycle
            do j = 0, i
              l_row_beg = j*l_rows_per_tile+1
              l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
              if (l_row_end < l_row_beg) cycle
#ifdef WITH_OPENMP
              if (mod(n_iter,n_threads) == my_thread) then
                if (wantDebug) call obj%timer%start("blas")
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                                    l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                                    ONE, a_mat(l_row_beg,l_col_beg), lda,         &
                                    v_row(l_row_beg), 1, ONE, uc_p(l_col_beg,my_thread), 1)

                if (i/=j) then
                  call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,          &
                                      ONE, a_mat(l_row_beg,l_col_beg), lda, v_col(l_col_beg), 1,  &

                                      ONE, ur_p(l_row_beg,my_thread), 1)
                endif
                if (wantDebug) call obj%timer%stop("blas")
              endif
              n_iter = n_iter+1
#else /* WITH_OPENMP */

              ! multiplication by blocks is efficient only for the CPU
              ! for the GPU we introduced 2 other ways, either by stripes (more similar to the original
              ! CPU implementation) or by one large matrix-vector multiply
              if (.not. useGPU) then
                if (wantDebug) call obj%timer%start("blas")
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ,  &
                            l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                            ONE, a_mat(l_row_beg, l_col_beg), lda,         &
                            v_row(l_row_beg), 1,                           &
                            ONE, u_col(l_col_beg), 1)

                if (i/=j) then

                  call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,  &
                                      ONE, a_mat(l_row_beg,l_col_beg), lda,               &
                                      v_col(l_col_beg), 1,                                &
                                      ONE, u_row(l_row_beg), 1)
                endif
                if (wantDebug) call obj%timer%stop("blas")
              endif ! not useGPU

#endif /* WITH_OPENMP */
            enddo  ! j=0,i
          enddo  ! i=0,(istep-2)/tile_size

          if (useGPU) then
            if (mat_vec_as_one_block) then
              ! Unlike for the CPU, we (for each MPI task) do just one large mat-vec multiplication
              ! this requires altering the algorithm when later explicitly updating the matrix
              ! after max_stored_uv is reached: we need to update all tiles, not only those above the diagonal
              if (wantDebug) call obj%timer%start("cublas")
              call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols,  &
                                        ONE, a_dev, lda,                   &
                                        v_row_dev , 1,                          &
                                        ONE, u_col_dev, 1)

       ! todo: try with non transposed!!!
!                 if(i/=j) then
!                   call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,  &
!                                             ONE, a_dev + a_offset, lda,                        &
!                                             v_col_dev + (l_col_beg - 1) *                      &
!                                             size_of_datatype, 1,                          &
!                                             ONE, u_row_dev + (l_row_beg - 1) *                 &
!                                             size_of_datatype, 1)
!                 endif
617
              if (wantDebug) call obj%timer%stop("cublas")
            else
              ! perform multiplication by stripes - it is faster than by blocks, since we call cublas with
              ! larger matrices. In general, however, this algorithm is very similar to the one for the CPU
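              ! Illustration (reading of the loops below, not authoritative): for tile column i
              ! the first GEMV works on rows 1 .. min(l_rows,(i+1)*l_rows_per_tile) and only on
              ! the local columns of tile i, i.e. on one tall stripe of a_dev rather than on the
              ! individual square blocks used in the CPU code path above.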
              do i=0,(istep-2)/tile_size
                  l_col_beg = i*l_cols_per_tile+1
                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                  if(l_col_end<l_col_beg) cycle
                  l_row_beg = 1
                  l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
                  a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
                          size_of_datatype
                  call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                              l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                              ONE, a_dev + a_offset, lda,  &
                              v_row_dev + (l_row_beg - 1) * size_of_datatype, 1,  &
                              ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
              enddo
              do i=0,(istep-2)/tile_size
                  l_col_beg = i*l_cols_per_tile+1
                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                  if(l_col_end<l_col_beg) cycle
                  l_row_beg = 1
                  l_row_end = min(l_rows,i*l_rows_per_tile)

                  a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
                          size_of_datatype

                  call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                              ONE, a_dev + a_offset, lda, &
                              v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
                              ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
              enddo
            end if !multiplication as one block / per stripes

            successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)

            successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
          endif

!              call PRECISION_SYMV('U', l_cols,  &
!                         1.d0, a_mat, ubound(a_mat,1),  &
!                         v_row, 1,  &
!                         0.d0, u_col, 1)

!            endif ! useGPU

#ifdef WITH_OPENMP
!$OMP END PARALLEL
          call obj%timer%stop("OpenMP parallel")

          do i=0,max_threads-1
            u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
            u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
          enddo
#endif /* WITH_OPENMP */

          ! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
          if (n_stored_vecs > 0) then
            if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
            call PRECISION_GEMV('T',     &
#endif
#if COMPLEXCASE == 1
            call PRECISION_GEMV('C',     &
#endif
                                l_rows, 2*n_stored_vecs,   &
                                ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),   &
                                v_row,  1, ZERO, aux, 1)

            call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs,   &
                                ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1),   &
                                aux, 1, ONE, u_col,  1)
            if (wantDebug) call obj%timer%stop("blas")
          endif

        endif  ! (l_rows>0 .and. l_cols>0)

        ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
        ! on the processors containing the diagonal
        ! This is only necessary if u_row has been calculated, i.e. if the
        ! global tile size is smaller than the global remaining matrix

        if (tile_size < istep-1) then

          call elpa_reduce_add_vectors_&
          &MATH_DATATYPE&
          &_&
          &PRECISION &
          (obj, u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
           mpi_comm_cols, istep-1, 1, nblk, max_threads)

        endif

        ! Sum up all the u_col(:) parts, transpose u_col -> u_row

        if (l_cols>0) then
          tmp(1:l_cols) = u_col(1:l_cols)
#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
          call mpi_allreduce(tmp, u_col, l_cols, MPI_MATH_DATATYPE_PRECISION,    &
                             MPI_SUM, mpi_comm_rows, mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */