#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

#include "../general/sanity.F90"

#undef SAVE_MATR
#ifdef DOUBLE_PRECISION_REAL
#define SAVE_MATR(name, iteration) \
call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,name,iteration)
#else
#define SAVE_MATR(name, iteration)
#endif
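! Note (added for readability): the SAVE_MATR macro above is a debugging aid; when the
! double-precision real variant is compiled it dumps the distributed matrix via prmat
! (module matrix_plot), otherwise it expands to nothing.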

!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like Scalapack Routine PDSYTRD)
!>
!  Parameters
!
!> \param obj         object of elpa_type
!>
!> \param na          Order of matrix
!>
!> \param a_mat(lda,matrixCols)    Distributed matrix which should be reduced.
!>              Distribution is like in Scalapack.
!>              As opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half);
!>              a(:,:) is overwritten on exit with the Householder vectors
!>
!> \param lda         Leading dimension of a
!>
!> \param nblk        blocksize of the cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols  local columns of the matrix
!>
!> \param mpi_comm_rows        MPI communicator for rows
!> \param mpi_comm_cols        MPI communicator for columns
!>
!> \param d_vec(na)       Diagonal elements (returned), identical on all processors
!>
!> \param e_vec(na)       Off-diagonal elements (returned), identical on all processors
!>
!> \param tau(na)     Factors for the Householder vectors (returned), needed for back transformation
!>
!> \param useGPU      If true, the GPU version of the subroutine will be used
!> \param wantDebug   if true, more debug information is collected
!>
    subroutine tridiag_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
    (obj, na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug)
      use cuda_functions
      use iso_c_binding
      use precision
      use elpa_abstract_impl
      use matrix_plot
      implicit none
#include "../general/precision_kinds.F90"
      class(elpa_abstract_impl_t), intent(inout) :: obj
      integer(kind=ik), intent(in)                  :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      logical, intent(in)                           :: useGPU, wantDebug

#if REALCASE == 1
      real(kind=REAL_DATATYPE), intent(out)         :: tau(na)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), intent(out)   :: tau(na)
#endif
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,*)
#else
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,matrixCols)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*)
#else
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
#endif
#endif
      real(kind=REAL_DATATYPE), intent(out)         :: d_vec(na), e_vec(na)

      integer(kind=ik), parameter                   :: max_stored_uv = 32
      logical,          parameter                   :: mat_vec_as_one_block = .true.
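      ! (explanatory note, not original code:)
      ! max_stored_uv is the number of (v,u) Householder pairs accumulated before the
      ! matrix is updated explicitly (a rank-2*max_stored_uv update);
      ! mat_vec_as_one_block selects, for the GPU path, one large matrix-vector product
      ! instead of the tile-wise multiplication used on the CPU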

      ! id in processor row and column and total numbers of processor rows and columns
      integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols
      integer(kind=ik)                              :: mpierr
      integer(kind=ik)                              :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
                                                       max_local_cols
      ! updated after each istep (in the main cycle) to contain number of
      ! local columns and rows of the remaining part of the matrix
      !integer(kind=ik)                             :: l_cols, l_rows
      integer(kind=ik)                              :: l_cols, l_rows
      integer(kind=ik)                              :: n_stored_vecs

      integer(kind=C_intptr_T)                      :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
                                                       uv_stored_cols_dev
      logical                                       :: successCUDA

      integer(kind=ik)                              :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
      integer(kind=ik)                              :: tile_size, l_rows_per_tile, l_cols_per_tile
      integer(kind=c_intptr_t)                        :: a_offset

#ifdef WITH_OPENMP
      integer(kind=ik)                              :: my_thread, n_threads, max_threads, n_iter
      integer(kind=ik)                              :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
#endif

      real(kind=REAL_DATATYPE)                      :: vnorm2
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                      :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE)                :: vav, xc, aux(2*max_stored_uv),  aux1(2), aux2(2), aux3(1), vrl, xf
#endif

#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp(:), &
                                                       v_row(:), &   ! used to store calculated Householder Vector
                                                       v_col(:), &   ! the same Vector, but transposed - differently distributed among MPI tasks
                                                       u_row(:), &
                                                       u_col(:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
#endif
      ! the following two matrices store pairs of vectors v and u calculated in each step
      ! at most max_stored_uv vector pairs are stored, then the matrix A_i is explicitly updated
      ! u and v are both stored in row and column form
      ! pattern: v1,u1,v2,u2,v3,u3,....
      ! todo: It is a little confusing, I think, that variables _row actually store columns and vice versa
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: vu_stored_rows(:,:)
      ! pattern: u1,v1,u2,v2,u3,v3,....
      real(kind=REAL_DATATYPE), allocatable         :: uv_stored_cols(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: vu_stored_rows(:,:), uv_stored_cols(:,:)
#endif

#ifdef WITH_OPENMP
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: ur_p(:,:), uc_p(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: ur_p(:,:), uc_p(:,:)
#endif
#endif

#if COMPLEXCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp_real(:)
#endif
      integer(kind=ik)                              :: istat
      character(200)                                :: errorMessage
      character(20)                                 :: gpuString
      integer(kind=c_intptr_t), parameter             :: size_of_datatype = size_of_&
                                                                          &PRECISION&
                                                                          &_&
                                                                          &MATH_DATATYPE
      if(useGPU) then
        gpuString = "_gpu"
      else
        gpuString = ""
      endif

      call obj%timer%start("tridiag_&
      &MATH_DATATYPE&
      &" // &
      PRECISION_SUFFIX // &
      gpuString )


      if (wantDebug) call obj%timer%start("mpi_communication")
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      if (wantDebug) call obj%timer%stop("mpi_communication")

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
      ! a tile is a square submatrix consisting of several blocks;
      ! it is the smallest possible square submatrix for which the blocks distributed among
      ! the processors are "aligned" in both rows and columns
      !  -----------------
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- ...
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- .
      !   : :   : :   : :    .
      !   : :   : :   : :      .
      !
      ! this is a tile, where each number represents block, assigned to a processor with the shown number
      ! the size of each small block is nblk
      ! the picture shows the situation with 6 processors, 3 processor rows and 2 processor columns
      ! tile_size is thus nblk * 6
      !
      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_per_tile = tile_size/np_rows ! local rows of a tile
      l_cols_per_tile = tile_size/np_cols ! local cols of a tile
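      ! Illustrative example (values chosen here, not from the original code): with
      ! nblk = 16, np_rows = 3, np_cols = 2 the least common multiple is 6, so the
      ! minimum tile_size is 96; rounding up so that local tiles are at least 128 wide
      ! gives tile_size = 384, hence l_rows_per_tile = 128 and l_cols_per_tile = 192.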

      totalblocks = (na-1)/nblk + 1
      max_loc_block_rows = (totalblocks-1)/np_rows + 1
      max_loc_block_cols = (totalblocks-1)/np_cols + 1

      ! the locally owned submatrix has size at most max_local_rows x max_local_cols on each processor
      max_local_rows = max_loc_block_rows*nblk
      max_local_cols = max_loc_block_cols*nblk

      ! allocate memory for vectors
      ! todo: It is a little confusing, I think, that variables _row actually store columns and vice versa
      ! todo: if something has length max_local_rows, it is actually a column, no?
      ! todo: probably one should read it as v_row = Vector v distributed among rows
      !
      allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "tmp", istat, errorMessage)

      ! allocate v_row 1 element longer, to allow storing and broadcasting tau together with it
      allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_row", istat, errorMessage)

      allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_row", istat, errorMessage)

      allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_col", istat, errorMessage)

      allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_col", istat, errorMessage)

#ifdef WITH_OPENMP
      max_threads = omp_get_max_threads()

      allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "ur_p", istat, errorMessage)

      allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uc_p", istat, errorMessage)
#endif /* WITH_OPENMP */

      tmp = 0
      v_row = 0
      u_row = 0
      v_col = 0
      u_col = 0

      allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)

      allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)

      if (useGPU) then
         successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
         check_alloc_cuda("tridiag: v_row_dev", successCUDA)

         successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)
         check_alloc_cuda("tridiag: u_row_dev", successCUDA)

         successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
         check_alloc_cuda("tridiag: v_col_dev", successCUDA)

         successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
         check_alloc_cuda("tridiag: u_col_dev", successCUDA)

         successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
         check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)

         successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
         check_alloc_cuda("tridiag: uv_stored_cols_dev", successCUDA)
      endif !useGPU


      d_vec(:) = 0
      e_vec(:) = 0
      tau(:) = 0

      n_stored_vecs = 0

      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat

      if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
#if COMPLEXCASE == 1
        d_vec(na) = real(a_mat(l_rows,l_cols), kind=rk)
#endif
#if REALCASE == 1
        d_vec(na) = a_mat(l_rows,l_cols)
#endif

      if (useGPU) then
        ! allocate memory for matrix A on the device and then copy the matrix

        successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
        check_alloc_cuda("tridiag: a_dev", successCUDA)

        successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("tridiag: a_dev", successCUDA)
      endif

      ! main cycle of tridiagonalization
      ! in each step, one Householder vector is calculated
      do istep = na, 3 ,-1
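
        ! Outline of one step (a summary of the code below, no additional logic):
        !  1. the process column owning global column istep builds the Householder vector v
        !     from the first istep-1 elements of that column,
        !  2. v (and tau) is broadcast along columns and transposed into v_col,
        !  3. u = (A + VU**T + UV**T)*v is accumulated tile-wise (CPU) or via cuBLAS (GPU),
        !  4. the new (v,u) pair is appended to vu_stored_rows/uv_stored_cols; once
        !     max_stored_uv pairs are stored (or istep == 3) the matrix is updated explicitly,
        !     and d_vec/e_vec collect the tridiagonal elements as they become available.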

        ! Calculate number of local rows and columns of the still remaining matrix
        ! on the local processor
        l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
        l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)

        ! Calculate Vector for Householder transformation on all procs
        ! owning column istep

        if (my_pcol == pcol(istep, nblk, np_cols)) then

          ! Get Vector to be transformed; distribute last element and norm of
          ! remaining elements to all procs in current column

          ! copy local column l_cols + 1 of A (i.e. global column istep) to v_row
          if (useGPU) then
            a_offset = l_cols * lda * size_of_datatype
            ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)

            successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag a_dev 1", successCUDA)
          else
            v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
          endif

            if (n_stored_vecs > 0 .and. l_rows > 0) then
              if (wantDebug) call obj%timer%start("blas")
#if COMPLEXCASE == 1
              aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
              call PRECISION_GEMV('N',   &
                                  l_rows, 2*n_stored_vecs,                                  &
                                  ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),        &
#if REALCASE == 1
                                  uv_stored_cols(l_cols+1,1), ubound(uv_stored_cols,dim=1), &
#endif
#if COMPLEXCASE == 1
                                   aux, 1,  &

#endif
                                  ONE, v_row, 1)
               if (wantDebug) call obj%timer%stop("blas")

            endif

            if(my_prow == prow(istep-1, nblk, np_rows)) then
               aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
               aux1(2) = v_row(l_rows)
            else
               aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
               aux1(2) = 0.
            endif
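
            ! aux1(1) now holds this task's contribution to |v|**2 (excluding the last
            ! element on the task that owns it) and aux1(2) that last element; both are
            ! summed over mpi_comm_rows below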

#ifdef WITH_MPI
            if (wantDebug) call obj%timer%start("mpi_communication")
            call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
                               MPI_SUM, mpi_comm_rows, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          aux2 = aux1
#endif /* WITH_MPI */

#if REALCASE == 1
          vnorm2 = aux2(1)
#endif
#if COMPLEXCASE == 1
          vnorm2 = real(aux2(1),kind=rk)
#endif
          vrl    = aux2(2)

          ! Householder transformation
#if REALCASE == 1
          call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
          call hh_transform_complex_&
#endif
               &PRECISION &
                             (obj, vrl, vnorm2, xf, tau(istep), wantDebug)
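          ! (hh_transform computes tau(istep) and the scaling factor xf from the squared
          !  norm of the remaining elements (vnorm2) and the element vrl; on return vrl
          !  holds the new off-diagonal element that is stored into e_vec below)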
          ! Scale v_row and store Householder Vector for back transformation

          v_row(1:l_rows) = v_row(1:l_rows) * xf
          if (my_prow == prow(istep-1, nblk, np_rows)) then
            v_row(l_rows) = 1.

            ! vrl is newly computed off-diagonal element of the final tridiagonal matrix
#if REALCASE == 1
            e_vec(istep-1) = vrl
#endif
#if COMPLEXCASE == 1
            e_vec(istep-1) = real(vrl,kind=rk)
#endif
          endif

          ! store Householder Vector for back transformation
          a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)

          ! add tau after the end of the actual v_row, to be broadcast together with it
          v_row(l_rows+1) = tau(istep)
         endif !(my_pcol == pcol(istep, nblk, np_cols))

!          SAVE_MATR("HH vec stored", na - istep + 1)

#ifdef WITH_MPI
         if (wantDebug) call obj%timer%start("mpi_communication")
         ! Broadcast the Householder Vector (and tau) along columns
         call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION,    &
                        pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
         if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */

        ! recover tau, which has been broadcast together with v_row
        tau(istep) =  v_row(l_rows+1)

        ! Transpose Householder Vector v_row -> v_col
        call elpa_transpose_vectors_&
             &MATH_DATATYPE&
             &_&
             &PRECISION &
                   (obj, v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
                    1, istep-1, 1, nblk)

        ! Calculate u = (A + VU**T + UV**T)*v

        ! For cache efficiency, we use only the upper half of the matrix tiles for this,
        ! thus the result is partly in u_col(:) and partly in u_row(:)
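        ! (each tile on or above the diagonal is used twice: transposed, or conjugate
        !  transposed in the complex case, to accumulate into u_col and, for off-diagonal
        !  tiles, untransposed to accumulate into u_row - this exploits the symmetry of A)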

        u_col(1:l_cols) = 0
        u_row(1:l_rows) = 0
        if (l_rows > 0 .and. l_cols > 0) then
          if (useGPU) then
            successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
            check_memcpy_cuda("tridiag: u_col_dev", successCUDA)

            successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
            check_memcpy_cuda("tridiag: u_row_dev", successCUDA)

            successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("tridiag: v_col_dev", successCUDA)

            successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
          endif ! useGPU

#ifdef WITH_OPENMP
          call obj%timer%start("OpenMP parallel")
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)

          my_thread = omp_get_thread_num()
          n_threads = omp_get_num_threads()

          n_iter = 0

          ! first calculate A*v part of (A + VU**T + UV**T)*v
          uc_p(1:l_cols,my_thread) = 0.
          ur_p(1:l_rows,my_thread) = 0.
#endif /* WITH_OPENMP */
          do i= 0, (istep-2)/tile_size
            l_col_beg = i*l_cols_per_tile+1
            l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
            if (l_col_end < l_col_beg) cycle
            do j = 0, i
              l_row_beg = j*l_rows_per_tile+1
              l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
              if (l_row_end < l_row_beg) cycle
#ifdef WITH_OPENMP
              if (mod(n_iter,n_threads) == my_thread) then
                if (wantDebug) call obj%timer%start("blas")
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                                    l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                                    ONE, a_mat(l_row_beg,l_col_beg), lda,         &
                                    v_row(l_row_beg), 1, ONE, uc_p(l_col_beg,my_thread), 1)

                if (i/=j) then
                  call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,          &
                                      ONE, a_mat(l_row_beg,l_col_beg), lda, v_col(l_col_beg), 1,  &

                                      ONE, ur_p(l_row_beg,my_thread), 1)
                endif
                if (wantDebug) call obj%timer%stop("blas")
              endif
              n_iter = n_iter+1
#else /* WITH_OPENMP */

              ! multiplication by blocks is efficient only for the CPU
              ! for the GPU we introduced 2 other ways, either by stripes (more similar to the original
              ! CPU implementation) or by one large matrix-vector multiply
              if (.not. useGPU) then
                if (wantDebug) call obj%timer%start("blas")
                call PRECISION_GEMV(BLAS_TRANS_OR_CONJ,  &
                            l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                            ONE, a_mat(l_row_beg, l_col_beg), lda,         &
                            v_row(l_row_beg), 1,                           &
                            ONE, u_col(l_col_beg), 1)

                if (i/=j) then

                  call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,  &
                                      ONE, a_mat(l_row_beg,l_col_beg), lda,               &
                                      v_col(l_col_beg), 1,                                &
                                      ONE, u_row(l_row_beg), 1)
                endif
                if (wantDebug) call obj%timer%stop("blas")
              endif ! not useGPU

#endif /* WITH_OPENMP */
            enddo  ! j=0,i
          enddo  ! i=0,(istep-2)/tile_size

          if (useGPU) then
            if(mat_vec_as_one_block) then
              ! Unlike for the CPU, we do just one large mat-vec multiplication (per MPI task);
              ! this requires altering the algorithm when later explicitly updating the matrix
              ! after max_stored_uv is reached: we need to update all tiles, not only those above the diagonal
              if (wantDebug) call obj%timer%start("cublas")
              call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols,  &
                                        ONE, a_dev, lda,                   &
                                        v_row_dev , 1,                          &
                                        ONE, u_col_dev, 1)

       ! todo: try with non transposed!!!
!                 if(i/=j) then
!                   call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,  &
!                                             ONE, a_dev + a_offset, lda,                        &
!                                             v_col_dev + (l_col_beg - 1) *                      &
!                                             size_of_datatype, 1,                          &
!                                             ONE, u_row_dev + (l_row_beg - 1) *                 &
!                                             size_of_datatype, 1)
!                 endif
              if (wantDebug) call obj%timer%stop("cublas")

            else
              ! perform multiplication by stripes - it is faster than by blocks, since we call cublas with
              ! larger matrices. In general, however, this algorithm is very similar to the one for the CPU
              do i=0,(istep-2)/tile_size
                  l_col_beg = i*l_cols_per_tile+1
                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                  if(l_col_end<l_col_beg) cycle

                  l_row_beg = 1
                  l_row_end = min(l_rows,(i+1)*l_rows_per_tile)

                  a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
                          size_of_datatype

                  call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
                              l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                              ONE, a_dev + a_offset, lda,  &
                              v_row_dev + (l_row_beg - 1) * size_of_datatype, 1,  &
                              ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
              enddo

              do i=0,(istep-2)/tile_size
                  l_col_beg = i*l_cols_per_tile+1
                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
                  if(l_col_end<l_col_beg) cycle

                  l_row_beg = 1
                  l_row_end = min(l_rows,i*l_rows_per_tile)

                  a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
                          size_of_datatype

                  call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                              ONE, a_dev + a_offset, lda, &
                              v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
                              ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
              enddo
            end if !multiplication as one block / per stripes

            successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)

            successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
          endif

!              call PRECISION_SYMV('U', l_cols,  &
!                         1.d0, a_mat, ubound(a_mat,1),  &
!                         v_row, 1,  &
!                         0.d0, u_col, 1)

!            endif ! useGPU

#ifdef WITH_OPENMP
!$OMP END PARALLEL
          call obj%timer%stop("OpenMP parallel")

          do i=0,max_threads-1
            u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
            u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
          enddo
#endif /* WITH_OPENMP */

          ! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
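          ! (the two GEMV calls below form aux = vu_stored_rows**T * v_row, or the conjugate
          !  transpose in the complex case, and then u_col = u_col + uv_stored_cols * aux,
          !  i.e. the contribution of the not yet applied stored updates)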
          if (n_stored_vecs > 0) then
            if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
            call PRECISION_GEMV('T',     &
#endif
#if COMPLEXCASE == 1
            call PRECISION_GEMV('C',     &
#endif
                                l_rows, 2*n_stored_vecs,   &
                                ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),   &
                                v_row,  1, ZERO, aux, 1)

            call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs,   &
                                ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1),   &
                                aux, 1, ONE, u_col,  1)
            if (wantDebug) call obj%timer%stop("blas")
          endif

        endif  ! (l_rows>0 .and. l_cols>0)

        ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
        ! on the processors containing the diagonal
        ! This is only necessary if u_row has been calculated, i.e. if the
        ! global tile size is smaller than the global remaining matrix

        if (tile_size < istep-1) then

          call elpa_reduce_add_vectors_&
          &MATH_DATATYPE&
          &_&
          &PRECISION &
          (obj, u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
           mpi_comm_cols, istep-1, 1, nblk)

        endif

        ! Sum up all the u_col(:) parts, transpose u_col -> u_row

        if (l_cols>0) then
          tmp(1:l_cols) = u_col(1:l_cols)
#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
          call mpi_allreduce(tmp, u_col, l_cols, MPI_MATH_DATATYPE_PRECISION,    &
                             MPI_SUM, mpi_comm_rows, mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          u_col = tmp
#endif /* WITH_MPI */
        endif

        call elpa_transpose_vectors_&
        &MATH_DATATYPE&
        &_&
        &PRECISION &
        (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
         mpi_comm_rows, 1, istep-1, 1, nblk)

        ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
#if REALCASE == 1
        x = 0
        if (l_cols>0)  &
           x = dot_product(v_col(1:l_cols),u_col(1:l_cols))
#endif
#if COMPLEXCASE == 1
        xc = 0
        if (l_cols>0)  &
           xc = dot_product(v_col(1:l_cols),u_col(1:l_cols))
#endif

#ifdef WITH_MPI
        if (wantDebug) call obj%timer%start("mpi_communication")
#if REALCASE == 1
        call mpi_allreduce(x, vav, 1, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
#if COMPLEXCASE == 1
        call mpi_allreduce(xc, vav, 1 , MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
        if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */

#if REALCASE == 1
        vav = x
#endif
#if COMPLEXCASE == 1
        vav = xc
#endif

#endif /* WITH_MPI */

        ! store u and v in the matrices U and V
        ! these matrices are stored combined in one here
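        ! (in effect, one stored column pair realizes the classical rank-2 Householder update
        !  A <- A - tau*(v*u**T + u*v**T) + tau**2*(v**T*u)*v*v**T, with u = A*v;
        !  conjugation applies in the complex case)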

        do j=1,l_rows
#if REALCASE == 1
          vu_stored_rows(j,2*n_stored_vecs+1) = tau(istep)*v_row(j)
          vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*tau(istep)*vav*v_row(j) - u_row(j)
#endif
#if COMPLEXCASE == 1
          vu_stored_rows(j,2*n_stored_vecs+1) = conjg(tau(istep))*v_row(j)
          vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*conjg(tau(istep))*vav*v_row(j) - u_row(j)
#endif
        enddo
        do j=1,l_cols
#if REALCASE == 1
          uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*tau(istep)*vav*v_col(j) - u_col(j)
          uv_stored_cols(j,2*n_stored_vecs+2) = tau(istep)*v_col(j)
#endif
#if COMPLEXCASE == 1
          uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*conjg(tau(istep))*vav*v_col(j) - u_col(j)
          uv_stored_cols(j,2*n_stored_vecs+2) = conjg(tau(istep))*v_col(j)
#endif
        enddo

        ! We have calculated another Householder vector; the number of implicitly stored pairs increases
        n_stored_vecs = n_stored_vecs+1

        ! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
        if (n_stored_vecs == max_stored_uv .or. istep == 3) then

          if (useGPU) then
            successCUDA = cuda_memcpy(vu_stored_rows_dev, loc(vu_stored_rows(1,1)), &
                                      max_local_rows * 2 * max_stored_uv *          &
                                      size_of_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)

            successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
                                      max_local_cols * 2 * max_stored_uv *          &
                                      size_of_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
          endif

          do i = 0, (istep-2)/tile_size
            ! go over tiles above (or on) the diagonal
            l_col_beg = i*l_cols_per_tile+1
            l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
            l_row_beg = 1
            l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
            if (l_col_end<l_col_beg .or. l_row_end<l_row_beg) &
              cycle


            if (useGPU) then
              if(.not. mat_vec_as_one_block) then
                ! if using mat-vec multiply by stripes, it is enough to update tiles above (or on) the diagonal only
                ! we then use the same calls as in the CPU version