#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"

#undef SAVE_MATR
#ifdef DOUBLE_PRECISION_REAL
#define SAVE_MATR(name, iteration) \
call prmat(na, useGpu, a_mat, a_dev, matrixRows, matrixCols, nblk, my_prow, my_pcol, np_rows, np_cols, name, iteration)
#else
#define SAVE_MATR(name, iteration)
#endif

!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like Scalapack Routine PDSYTRD)
!>
!  Parameters
!
!> \param obj         object of elpa_type
!>
!> \param na          Order of matrix
!>
!> \param a_mat(matrixRows,matrixCols)    Distributed matrix which should be reduced.
!>              Distribution is like in Scalapack.
!>              As opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half)
!>              a(:,:) is overwritten on exit with the Householder vectors
!>
!> \param matrixRows  Leading dimension of a
!>
!> \param nblk        blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols  local columns of matrix
!>
!> \param mpi_comm_rows        MPI-Communicator for rows
!> \param mpi_comm_cols        MPI-Communicator for columns
!>
!> \param d_vec(na)   Diagonal elements (returned), identical on all processors
!>
!> \param e_vec(na)   Off-diagonal elements (returned), identical on all processors
!>
!> \param tau(na)     Factors for the Householder vectors (returned), needed for back transformation
!>
!> \param useGPU      If true, the GPU version of the subroutine will be used
!> \param wantDebug   If true, print more debug information
!>
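!> \param max_threads maximum number of OpenMP threads to be used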
    subroutine tridiag_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
    (obj, na, a_mat, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug, max_threads)

      use cuda_functions
      use iso_c_binding
      use precision
      use elpa_abstract_impl
      use matrix_plot
      use elpa_omp
      use elpa_blas_interfaces

      implicit none
#include "../general/precision_kinds.F90"
      class(elpa_abstract_impl_t), intent(inout)    :: obj
      integer(kind=ik), intent(in)                  :: na, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      logical, intent(in)                           :: useGPU, wantDebug
      integer(kind=c_int)                           :: skewsymmetric
      logical                                       :: isSkewsymmetric

      MATH_DATATYPE(kind=rck), intent(out)          :: tau(na)
#ifdef USE_ASSUMED_SIZE
      MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(matrixRows,*)
#else
      MATH_DATATYPE(kind=rck), intent(inout)        :: a_mat(matrixRows,matrixCols)
#endif
      real(kind=rk), intent(out)                    :: d_vec(na)
      real(kind=rk), intent(out)                    :: e_vec(na)
      integer(kind=ik), parameter                   :: max_stored_uv = 32
      logical,          parameter                   :: mat_vec_as_one_block = .true.

      ! id in processor row and column and total numbers of processor rows and columns
      integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols
      integer(kind=MPI_KIND)                        :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
      integer(kind=MPI_KIND)                        :: mpierr
      integer(kind=ik)                              :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
                                                       max_local_cols
      ! updated after each istep (in the main cycle) to contain number of
      ! local columns and rows of the remaining part of the matrix
      !integer(kind=ik)                             :: l_cols, l_rows
      integer(kind=ik)                              :: l_cols, l_rows
      integer(kind=ik)                              :: n_stored_vecs

      integer(kind=C_intptr_T)                      :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
                                                       uv_stored_cols_dev
      logical                                       :: successCUDA

      integer(kind=ik)                              :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
      integer(kind=ik)                              :: tile_size, l_rows_per_tile, l_cols_per_tile
      integer(kind=c_intptr_t)                      :: a_offset

      integer(kind=ik), intent(in)                  :: max_threads
#ifdef WITH_OPENMP
      integer(kind=ik)                              :: my_thread, n_threads, n_iter
#endif

      real(kind=rk)                                 :: vnorm2
      MATH_DATATYPE(kind=rck)                       :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#if COMPLEXCASE == 1
      complex(kind=rck)                             :: aux3(1)
#endif

      MATH_DATATYPE(kind=rck), allocatable          :: tmp(:)
      MATH_DATATYPE(kind=rck), allocatable          :: v_row(:), &   ! used to store calculated Householder Vector
                                                       v_col(:), &   ! the same Vector, but transposed 
                                                                     ! - differently distributed among MPI tasks
                                                       u_row(:), &
                                                       u_col(:)
      ! the following two matrices store pairs of vectors v and u calculated in each step
      ! at most max_stored_uv vector pairs are stored, then the matrix A_i is explicitly updated
      ! u and v are stored both in row and column forms
      ! pattern: v1,u1,v2,u2,v3,u3,....
      ! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
      MATH_DATATYPE(kind=rck), allocatable         :: vu_stored_rows(:,:)
      ! pattern: u1,v1,u2,v2,u3,v3,....
      MATH_DATATYPE(kind=rck), allocatable         :: uv_stored_cols(:,:)
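      ! i.e. for the k-th stored pair, vu_stored_rows(:,2*k-1) holds v_k and vu_stored_rows(:,2*k)
      ! holds u_k (distributed along rows), while uv_stored_cols(:,2*k-1) holds u_k and
      ! uv_stored_cols(:,2*k) holds v_k (distributed along columns)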

#ifdef WITH_OPENMP
      MATH_DATATYPE(kind=rck), allocatable         :: ur_p(:,:), uc_p(:,:)
#endif

      real(kind=rk), allocatable                    :: tmp_real(:)
      integer(kind=ik)                              :: min_tile_size, error
      integer(kind=ik)                              :: istat
      character(200)                                :: errorMessage
      character(20)                                 :: gpuString
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
                                                                          &PRECISION&
                                                                          &_&
                                                                          &MATH_DATATYPE

      call obj%get("is_skewsymmetric",skewsymmetric,istat)
      if (istat .ne. ELPA_OK) then
           print *,"Problem getting option. Aborting..."
           stop
      endif
      isSkewsymmetric = (skewsymmetric == 1)

      if(useGPU) then
        gpuString = "_gpu"
      else
        gpuString = ""
      endif

      call obj%timer%start("tridiag_&
      &MATH_DATATYPE&
      &" // &
      PRECISION_SUFFIX // &
      gpuString )


      if (wantDebug) call obj%timer%start("mpi_communication")
      call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
      call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
      call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr)
      call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr)

      my_prow = int(my_prowMPI, kind=c_int)
      np_rows = int(np_rowsMPI, kind=c_int)
      my_pcol = int(my_pcolMPI, kind=c_int)
      np_cols = int(np_colsMPI, kind=c_int)
      if (wantDebug) call obj%timer%stop("mpi_communication")

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
      ! a tile is a square submatrix, consisting of several blocks;
      ! it is the smallest possible square submatrix in which the blocks distributed among
      ! the processors are "aligned" in both rows and columns
      !  -----------------
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- ...
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- .
      !   : :   : :   : :    .
      !   : :   : :   : :      .
      !
      ! this is a tile, where each number represents a block assigned to the processor with that number
      ! the size of each such block is nblk
      ! the image shows the situation with 6 processors: 3 processor rows and 2 processor columns
      ! tile_size is thus nblk * 6
      !
      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size

      ! make tile_size the smallest possible multiple of the previously defined tile size that is
      ! larger than or equal to min_tile_size
      ! min_tile_size was originally hardcoded as 128 * max(np_rows, np_cols), so that is now the
      ! implicit default; it can, however, be set by the user
      call obj%get("min_tile_size", min_tile_size ,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      if(min_tile_size == 0) then
        ! not set by the user, use the default value
        min_tile_size = 128*max(np_rows, np_cols)
      endif
      tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
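      ! e.g. (illustrative values only): with nblk = 16, np_rows = 3, np_cols = 2 the base
      ! tile size is 16 * lcm(3,2) = 96; the default min_tile_size is 128 * 3 = 384, so the
      ! line above yields tile_size = ((384-1)/96 + 1) * 96 = 384, the smallest multiple
      ! of 96 that is >= 384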
      l_rows_per_tile = tile_size/np_rows ! local rows of a tile
      l_cols_per_tile = tile_size/np_cols ! local cols of a tile

      totalblocks = (na-1)/nblk + 1
      max_loc_block_rows = (totalblocks-1)/np_rows + 1
      max_loc_block_cols = (totalblocks-1)/np_cols + 1

      ! the locally owned submatrix has size at most max_local_rows x max_local_cols on each processor
      max_local_rows = max_loc_block_rows*nblk
      max_local_cols = max_loc_block_cols*nblk
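      ! e.g. (illustrative values only): na = 1000, nblk = 64, np_rows = 4 gives
      ! totalblocks = (999/64) + 1 = 16, max_loc_block_rows = (16-1)/4 + 1 = 4,
      ! and thus max_local_rows = 4 * 64 = 256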

      ! allocate memory for vectors
      ! todo: It is a little bit confusing, I think, that variables _row actually store columns and vice versa
      ! todo: if something has length max_local_rows, it is actually a column, no?
      ! todo: probably one should read it as v_row = vector v distributed among rows
      !
      allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "tmp", istat, errorMessage)

      ! allocate v_row one element longer, to allow storing and broadcasting tau together with it
      allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_row", istat, errorMessage)

      allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_row", istat, errorMessage)

      allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_col", istat, errorMessage)

      allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_col", istat, errorMessage)

#ifdef WITH_OPENMP
      allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "ur_p", istat, errorMessage)

      allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uc_p", istat, errorMessage)
#endif /* WITH_OPENMP */

      tmp = 0
      v_row = 0
      u_row = 0
      v_col = 0
      u_col = 0

      allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)

      allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)

      if (useGPU) then
         successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
         check_alloc_cuda("tridiag: v_row_dev", successCUDA)

         successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)
         check_alloc_cuda("tridiag: u_row_dev", successCUDA)

         successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
         check_alloc_cuda("tridiag: v_col_dev", successCUDA)

         successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
         check_alloc_cuda("tridiag: u_col_dev", successCUDA)

         successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
         check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)

         successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
         check_alloc_cuda("tridiag: uv_stored_cols_dev", successCUDA)
      endif !useGPU


      d_vec(:) = 0
      e_vec(:) = 0
      tau(:) = 0

      n_stored_vecs = 0

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat
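      ! (local_index with last argument -1 is used here to obtain the number of rows/cols
      !  of a_mat that are stored locally on this process)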

      if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
#if COMPLEXCASE == 1
        d_vec(na) = real(a_mat(l_rows,l_cols), kind=rk)
#endif
#if REALCASE == 1
        d_vec(na) = a_mat(l_rows,l_cols)
#endif

      if (useGPU) then
        ! allocate memory for matrix A on the device and then copy the matrix

        successCUDA = cuda_malloc(a_dev, matrixRows * matrixCols * size_of_datatype)
        check_alloc_cuda("tridiag: a_dev", successCUDA)

        successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
                      matrixRows * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("tridiag: a_dev", successCUDA)
      endif

      ! main cycle of tridiagonalization
      ! in each step, 1 Householder Vector is calculated
      do istep = na, 3, -1

        ! Calculate number of local rows and columns of the still remaining matrix
        ! on the local processor
        l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
        l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)

        ! Calculate Vector for Householder transformation on all procs
        ! owning column istep

        if (my_pcol == pcol(istep, nblk, np_cols)) then

          ! Get Vector to be transformed; distribute last element and norm of
          ! remaining elements to all procs in current column
          ! copy column number l_cols+1 of A to v_row
          if (useGPU) then
            a_offset = l_cols * matrixRows * size_of_datatype
            ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset,
            ! (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)

            successCUDA = cuda_memcpy(int(loc(v_row(1)),kind=c_intptr_t), &
                                      a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag a_dev 1", successCUDA)
          else
            v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
          endif

            if (n_stored_vecs > 0 .and. l_rows > 0) then
              if (wantDebug) call obj%timer%start("blas")
#if COMPLEXCASE == 1
              aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif

              call PRECISION_GEMV('N',   &
                                  int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), &
                                  ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), &
#if REALCASE == 1
                                  uv_stored_cols(l_cols+1,1), int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), &
#endif
#if COMPLEXCASE == 1
                                   aux, 1_BLAS_KIND,  &
#endif
                                  ONE, v_row, 1_BLAS_KIND)
               if (wantDebug) call obj%timer%stop("blas")

            endif

            if(my_prow == prow(istep-1, nblk, np_rows)) then
               aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
               aux1(2) = v_row(l_rows)
            else
               aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
               aux1(2) = 0.
            endif
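            ! aux1(1) holds the local contribution to |v|^2 (excluding the last element,
            ! which is kept separately in aux1(2)); the reduction below forms the global
            ! squared norm and the global last element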

#ifdef WITH_MPI
            if (wantDebug) call obj%timer%start("mpi_communication")
            call mpi_allreduce(aux1, aux2, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, &
                               MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          aux2 = aux1
#endif /* WITH_MPI */

#if REALCASE == 1
          vnorm2 = aux2(1)
#endif
#if COMPLEXCASE == 1
          vnorm2 = real(aux2(1),kind=rk)
#endif
          vrl    = aux2(2)

          ! Householder transformation
#if REALCASE == 1
          call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
          call hh_transform_complex_&
#endif
               &PRECISION &
                             (obj, vrl, vnorm2, xf, tau(istep), wantDebug)
          ! Scale v_row and store Householder Vector for back transformation

          v_row(1:l_rows) = v_row(1:l_rows) * xf
          if (my_prow == prow(istep-1, nblk, np_rows)) then
            v_row(l_rows) = 1.

            ! vrl is the newly computed off-diagonal element of the final tridiagonal matrix
#if REALCASE == 1
            e_vec(istep-1) = vrl
#endif
#if COMPLEXCASE == 1
            e_vec(istep-1) = real(vrl,kind=rk)
#endif
          endif

          ! store Householder Vector for back transformation
          a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)

          ! add tau after the end of the actual v_row, to be broadcast together with it
          v_row(l_rows+1) = tau(istep)
         endif !(my_pcol == pcol(istep, nblk, np_cols))

!          SAVE_MATR("HH vec stored", na - istep + 1)

#ifdef WITH_MPI
         if (wantDebug) call obj%timer%start("mpi_communication")
         ! Broadcast the Householder Vector (and tau) along columns
         call MPI_Bcast(v_row, int(l_rows+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION,    &
                        int(pcol(istep, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
         if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

        ! recover tau, which has been broadcast together with v_row
        tau(istep) =  v_row(l_rows+1)

        ! Transpose Householder Vector v_row -> v_col
        call elpa_transpose_vectors_&
             &MATH_DATATYPE&
             &_&
             &PRECISION &
                   (obj, v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
                    1, istep-1, 1, nblk, max_threads)

        ! Calculate u = (A + VU**T + UV**T)*v

        ! For cache efficiency, we use only the upper half of the matrix tiles for this,
        ! thus the result is partly in u_col(:) and partly in u_row(:)

        u_col(1:l_cols) = 0
        u_row(1:l_rows) = 0
        if (l_rows > 0 .and. l_cols > 0) then
          if(useGPU) then
            successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
            check_memcpy_cuda("tridiag: u_col_dev", successCUDA)

            successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
            check_memcpy_cuda("tridiag: u_row_dev", successCUDA)

            successCUDA = cuda_memcpy(v_col_dev, int(loc(v_col(1)),kind=c_intptr_t), &
                          l_cols * size_of_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("tridiag: v_col_dev", successCUDA)

            successCUDA = cuda_memcpy(v_row_dev, int(loc(v_row(1)),kind=c_intptr_t), &
                                      l_rows * size_of_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
          endif ! useGPU

#ifdef WITH_OPENMP
          call obj%timer%start("OpenMP parallel")
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)

          my_thread = omp_get_thread_num()
          n_threads = omp_get_num_threads()

          n_iter = 0

          ! first calculate A*v part of (A + VU**T + UV**T)*v
          uc_p(1:l_cols,my_thread) = 0.
          ur_p(1:l_rows,my_thread) = 0.
#endif /* WITH_OPENMP */
          do i = 0, (istep-2)/tile_size
            l_col_beg = i*l_cols_per_tile+1
            l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
            if (l_col_end < l_col_beg) cycle
            do j = 0, i
              l_row_beg = j*l_rows_per_tile+1
              l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
              if (l_row_end < l_row_beg) cycle
#ifdef WITH_OPENMP
              if (mod(n_iter,n_threads) == my_thread) then
                if (wantDebug) call obj%timer%start("blas")
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                                    int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                                    ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND),         &
                                    v_row(l_row_beg), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)

                if (i/=j) then
                  if (isSkewsymmetric) then
                    call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1, &
                                        kind=BLAS_KIND), &
                                        -ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), v_col(l_col_beg),     &
                                        1_BLAS_KIND, ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)

                  else
                    call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1, &
                                        kind=BLAS_KIND), &
                                        ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), v_col(l_col_beg), &
                                        1_BLAS_KIND, ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
                  endif
                endif
                if (wantDebug) call obj%timer%stop("blas")
              endif
              n_iter = n_iter+1
#else /* WITH_OPENMP */
              ! multiplication by blocks is efficient only for CPU
              ! for GPU we introduced 2 other ways, either by stripes (more similar to the original
              ! CPU implementation) or by one large matrix-vector multiply
              if (.not. useGPU) then
                if (wantDebug) call obj%timer%start("blas")
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ,  &
                            int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                            ONE, a_mat(l_row_beg, l_col_beg), int(matrixRows,kind=BLAS_KIND),         &
                            v_row(l_row_beg), 1_BLAS_KIND,                           &
                            ONE, u_col(l_col_beg), 1_BLAS_KIND)

                if (i/=j) then
                  if (isSkewsymmetric) then
                    call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                                        -ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND),               &
                                        v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND)

                  else
                    call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
                                        ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND),               &
                                        v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND)
                  endif
                endif
                if (wantDebug) call obj%timer%stop("blas")
              endif ! not useGPU

#endif /* WITH_OPENMP */
            enddo  ! j=0,i
          enddo  ! i=0,(istep-2)/tile_size

          if (useGPU) then
            if (mat_vec_as_one_block) then
              ! Unlike for CPU, we (for each MPI task) do just one large mat-vec multiplication
              ! this requires altering the algorithm when later explicitly updating the matrix
              ! after max_stored_uv is reached: we need to update all tiles, not only those above the diagonal
              if (wantDebug) call obj%timer%start("cublas")
              call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols,  &
                                        ONE, a_dev, matrixRows,                   &
                                        v_row_dev , 1,                          &
                                        ONE, u_col_dev, 1)

       ! todo: try with non transposed!!!
!                 if(i/=j) then
!                   call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,  &
!                                             ONE, a_dev + a_offset, matrixRows,                        &
!                                             v_col_dev + (l_col_beg - 1) *                      &
!                                             size_of_datatype, 1,                          &
!                                             ONE, u_row_dev + (l_row_beg - 1) *                 &
!                                             size_of_datatype, 1)
!                 endif
              if (wantDebug) call obj%timer%stop("cublas")

            else  ! mat_vec_as_one_block
              ! perform multiplication by stripes - it is faster than by blocks, since we call cublas with
              ! larger matrices. In general, however, this algorithm is very similar to the CPU one
              do i=0,(istep-2)/tile_size
                  l_col_beg = i*l_cols_per_tile+1
                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                  if(l_col_end<l_col_beg) cycle
                  l_row_beg = 1
                  l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
                  a_offset = ((l_row_beg-1) + (l_col_beg - 1) * matrixRows) * &
                          size_of_datatype
                  call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                              l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                              ONE, a_dev + a_offset, matrixRows,  &
                              v_row_dev + (l_row_beg - 1) * size_of_datatype, 1,  &
                              ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
              enddo
              do i=0,(istep-2)/tile_size
                  l_col_beg = i*l_cols_per_tile+1
                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                  if(l_col_end<l_col_beg) cycle
                  l_row_beg = 1
                  l_row_end = min(l_rows,i*l_rows_per_tile)

                  a_offset = ((l_row_beg-1) + (l_col_beg - 1) * matrixRows) * &
                          size_of_datatype
                  if (isSkewsymmetric) then
                     call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                                 -ONE, a_dev + a_offset, matrixRows, &
Carolin Penke's avatar
Carolin Penke committed
655
656
657
658
                                 v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
                                 ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
                  else
                     call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                                 ONE, a_dev + a_offset, matrixRows, &
                                 v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
                                 ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
                  endif
              enddo
            end if !multiplication as one block / per stripes

            successCUDA = cuda_memcpy(int(loc(u_col(1)),kind=c_intptr_t), &
                          u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
            successCUDA = cuda_memcpy(int(loc(u_row(1)),kind=c_intptr_t), &
                          u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
          endif ! useGPU

#ifdef WITH_OPENMP
!$OMP END PARALLEL
          call obj%timer%stop("OpenMP parallel")

          do i=0,max_threads-1
            u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
            u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
          enddo
#endif /* WITH_OPENMP */

          ! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
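          ! with the stored patterns (vu_stored_rows = v1,u1,v2,u2,... and
          ! uv_stored_cols = u1,v1,u2,v2,...) this amounts to two GEMVs:
          ! aux = vu_stored_rows**T * v_row (conjugated in the complex case),
          ! followed by u_col = u_col + uv_stored_cols * aux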
          if (n_stored_vecs > 0) then
            if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
            call PRECISION_GEMV('T',     &
#endif
#if COMPLEXCASE == 1
            call PRECISION_GEMV('C',     &
#endif
                                int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND),   &
                                ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND),   &
                                v_row,  1_BLAS_KIND, ZERO, aux, 1_BLAS_KIND)

            call PRECISION_GEMV('N', int(l_cols,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND),   &
                                ONE, uv_stored_cols, int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND),   &
                                aux, 1_BLAS_KIND, ONE, u_col,  1_BLAS_KIND)
            if (wantDebug) call obj%timer%stop("blas")
          endif

        endif  ! (l_rows>0 .and. l_cols>0)

        ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
        ! on the processors containing the diagonal
        ! This is only necessary if u_row has been calculated, i.e. if the
        ! global tile size is smaller than the global remaining matrix

        if (tile_size < istep-1) then

          call elpa_reduce_add_vectors_&
          &MATH_DATATYPE&
          &_&
          &PRECISION &
          (obj, u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
           mpi_comm_cols, istep-1, 1, nblk, max_threads)

        endif

        ! Sum up all the u_col(:) parts, transpose u_col -> u_row

        if (l_cols>0) then
          tmp(1:l_cols) = u_col(1:l_cols)
#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
          call mpi_allreduce(tmp, u_col, int(l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION,    &
                             MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          u_col = tmp
#endif /* WITH_MPI */
        endif
        if (isSkewsymmetric) then
           call elpa_transpose_vectors_ss_&
           &MATH_DATATYPE&
           &_&
           &PRECISION &
           (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
            mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
        else
           call elpa_transpose_vectors_&
           &MATH_DATATYPE&
           &_&
           &PRECISION &
           (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
            mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
        endif

        ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
        x = 0
        if (l_cols>0)  &
           x = dot_product(v_col(1:l_cols),u_col(1:l_cols))

#ifdef WITH_MPI
        if (wantDebug) call obj%timer%start("mpi_communication")
        call mpi_allreduce(x, vav, 1_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, int(mpi_comm_cols,kind=MPI_KIND), mpierr)