elpa1_tridiag_template.X90 46.1 KB
Newer Older
Pavel Kus's avatar
Pavel Kus committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

55
56
57
58
59
60
61

!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like Scalapack Routine PDSYTRD)
!>
!  Parameters
!
!> \param na          Order of matrix
!>
62
!> \param a_mat(lda,matrixCols)    Distributed matrix which should be reduced.
63
64
65
66
67
68
69
70
71
72
73
74
75
!>              Distribution is like in Scalapack.
!>              Opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half)
!>              a(:,:) is overwritten on exit with the Householder vectors
!>
!> \param lda         Leading dimension of a
!>
!> \param nblk        blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols  local columns of matrix
!>
!> \param mpi_comm_rows        MPI-Communicator for rows
!> \param mpi_comm_cols        MPI-Communicator for columns
!>
76
!> \param d_vec(na)       Diagonal elements (returned), identical on all processors
77
!>
78
!> \param e_vec(na)       Off-Diagonal elements (returned), identical on all processors
79
80
!>
!> \param tau(na)     Factors for the Householder vectors (returned), needed for back transformation
Pavel Kus's avatar
Pavel Kus committed
81
82
!>
!> \param useGPU      If true,  GPU version of the subroutine will be used
83
!>
84

85
86
87
88
89
    subroutine tridiag_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
    (na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU)
90
91
      use cuda_functions
      use iso_c_binding
Pavel Kus's avatar
Pavel Kus committed
92
93
#ifdef HAVE_DETAILED_TIMINGS
      use timings
94
95
#else
      use timings_dummy
Pavel Kus's avatar
Pavel Kus committed
96
97
98
99
#endif
      use precision
      implicit none

100
101
      integer(kind=ik), intent(in)                  :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      logical, intent(in)                           :: useGPU
102

103
104
105
106
107
108
109
110
111
112
113
114
115
116
#if REALCASE == 1
      real(kind=REAL_DATATYPE), intent(out)         :: tau(na)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), intent(out)   :: tau(na)
#endif
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,*)
#else
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,matrixCols)
#endif
#endif
#if COMPLEXCASE == 1
Pavel Kus's avatar
Pavel Kus committed
117
#ifdef USE_ASSUMED_SIZE
118
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*)
Pavel Kus's avatar
Pavel Kus committed
119
#else
120
121
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
#endif
Pavel Kus's avatar
Pavel Kus committed
122
#endif
123
      real(kind=REAL_DATATYPE), intent(out)         :: d_vec(na), e_vec(na)
Pavel Kus's avatar
Pavel Kus committed
124

125
      integer(kind=ik), parameter                   :: max_stored_uv = 32
Pavel Kus's avatar
Pavel Kus committed
126
127

      ! id in processor row and column and total numbers of processor rows and columns
128
129
      integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols, my_rank
      integer(kind=ik)                              :: mpierr
130
131
132
133
134
135
136
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
      real(kind=rk8), parameter                     :: ZERO=0.0_rk8, ONE = 1.0_rk8
#else
      real(kind=rk4), parameter                     :: ZERO=0.0_rk4, ONE = 1.0_rk4
#endif
#endif
137
138
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
139
      complex(kind=ck8), parameter                  :: ZERO = (0.0_rk8,0.0_rk8), ONE = (1.0_rk8,0.0_rk8)
140
#else
141
      complex(kind=ck4), parameter                  :: ZERO = (0.0_rk4,0.0_rk4), ONE = (1.0_rk4,0.0_rk4)
142
143
144
145
#endif
#endif
      integer(kind=ik)                              :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
                                                       max_local_cols
146
      ! updated after each istep (in the main cycle) to contain number of
Pavel Kus's avatar
Pavel Kus committed
147
      ! local columns and rows of the remaining part of the matrix
148
149
150
      !integer(kind=ik)                             :: l_cols, l_rows
      integer(kind=ik)                              :: l_cols, l_rows
      integer(kind=ik)                              :: n_stored_vecs
151

152
153
154
      integer(kind=C_intptr_T)                      :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
                                                       uv_stored_cols_dev
      logical                                       :: successCUDA
155

156
157
158
      integer(kind=ik)                              :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
      integer(kind=ik)                              :: tile_size, l_rows_per_tile, l_cols_per_tile
      integer(kind=c_size_t)                        :: a_offset
Pavel Kus's avatar
Pavel Kus committed
159
160

#ifdef WITH_OPENMP
161
162
      integer(kind=ik)                              :: my_thread, n_threads, max_threads, n_iter
      integer(kind=ik)                              :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
Pavel Kus's avatar
Pavel Kus committed
163
164
#endif

165
166
167
168
169
170
171
      real(kind=REAL_DATATYPE)                      :: vnorm2
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                      :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE)                :: vav, xc, aux(2*max_stored_uv),  aux1(2), aux2(2), aux3(1), vrl, xf
#endif
172

173
174
175
176
177
178
179
180
181
182
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp(:), &
                                                       v_row(:), &   ! used to store calculated Householder vector
                                                       v_col(:), &   ! the same vector, but transposed - differently distributed among MPI tasks
                                                       u_row(:), &
                                                       u_col(:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
#endif
183
184
      ! the following two matrices store pairs of vectors v and u calculated in each step
      ! at most max_stored_uv vector pairs are stored, then the matrix A_i is explicitly updated
185
      ! u and v are stored both in row and vector forms
186
187
      ! pattern: v1,u1,v2,u2,v3,u3,....
      ! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
188
189
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: vu_stored_rows(:,:)
190
      ! pattern: u1,v1,u2,v2,u3,v3,....
191
192
193
194
195
      real(kind=REAL_DATATYPE), allocatable         :: uv_stored_cols(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: vu_stored_rows(:,:), uv_stored_cols(:,:)
#endif
196

Pavel Kus's avatar
Pavel Kus committed
197
#ifdef WITH_OPENMP
198
199
200
201
202
203
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: ur_p(:,:), uc_p(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: ur_p(:,:), uc_p(:,:)
#endif
Pavel Kus's avatar
Pavel Kus committed
204
205
#endif

206
207
208
209
210
#if COMPLEXCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp_real(:)
#endif
      integer(kind=ik)                              :: istat
      character(200)                                :: errorMessage
Pavel Kus's avatar
Pavel Kus committed
211

212
213
214
215
216
217
      call timer%start("tridiag_&
      &MATH_DATATYPE&
      &_" // &
      PRECISION_SUFFIX &
      )

Pavel Kus's avatar
Pavel Kus committed
218
219
220
221
222
223
      call timer%start("mpi_communication")
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      call timer%stop("mpi_communication")
224

Pavel Kus's avatar
Pavel Kus committed
225
      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
226
227
228
      ! seems that tile is a square submatrix, consisting by several blocks
      ! it is a smallest possible square submatrix, where blocks being distributed among
      ! processors are "aligned" in both rows and columns
229
      !  -----------------
230
231
232
233
234
235
236
237
238
239
240
241
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- ...
      ! | 1 4 | 1 4 | 1 4 | ...
      ! | 2 5 | 2 5 | 2 5 | ...
      ! | 3 6 | 3 6 | 3 6 | ...
      !  ----------------- .
      !   : :   : :   : :    .
      !   : :   : :   : :      .
      !
      ! this is a tile, where each number represents block, assigned to a processor with the shown number
242
      ! size of this small block is nblk
243
      ! Image is for situation with 6 processors, 3 processor rows and 2 columns
244
      ! tile_size is thus nblk * 6
245
      !
Pavel Kus's avatar
Pavel Kus committed
246
247
      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
248

249
250
      l_rows_per_tile = tile_size/np_rows ! local rows of a tile
      l_cols_per_tile = tile_size/np_cols ! local cols of a tile
251

Pavel Kus's avatar
Pavel Kus committed
252
      totalblocks = (na-1)/nblk + 1
253
254
      max_loc_block_rows = (totalblocks-1)/np_rows + 1
      max_loc_block_cols = (totalblocks-1)/np_cols + 1
Pavel Kus's avatar
Pavel Kus committed
255

256
257
258
      ! localy owned submatrix has size at most max_local_rows x max_local_cols at each processor
      max_local_rows = max_loc_block_rows*nblk
      max_local_cols = max_loc_block_cols*nblk
259

260
261
262
263
264
      ! allocate memory for vectors
      ! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa
      ! todo: if something has length max_local_rows, it is actually a column, no?
      ! todo: probably one should read it as v_row = vector v distributed among rows
      !
Pavel Kus's avatar
Pavel Kus committed
265
      allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
266
267
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "tmp", istat, errorMessage)
Pavel Kus's avatar
Pavel Kus committed
268

269
      ! allocate v_row 1 element longer to allow store and broadcast tau together with it
270
      allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
271
272
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_row", istat, errorMessage)
Pavel Kus's avatar
Pavel Kus committed
273

274
      allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
275
276
277
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_row", istat, errorMessage)

278
      allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
279
280
281
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "v_col", istat, errorMessage)

282
      allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
283
284
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "u_col", istat, errorMessage)
Pavel Kus's avatar
Pavel Kus committed
285
286
287
288
289

#ifdef WITH_OPENMP
      max_threads = omp_get_max_threads()

      allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
290
291
292
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "ur_p", istat, errorMessage)

Pavel Kus's avatar
Pavel Kus committed
293
      allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
294
295
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uc_p", istat, errorMessage)
296
#endif /* WITH_OPENMP */
Pavel Kus's avatar
Pavel Kus committed
297
298

      tmp = 0
299
300
301
302
      v_row = 0
      u_row = 0
      v_col = 0
      u_col = 0
Pavel Kus's avatar
Pavel Kus committed
303

304
      allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
305
306
307
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)

308
      allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
309
310
311
      call check_alloc("tridiag_&
      &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)

312
      if (useGPU) then
313
         successCUDA = cuda_malloc(v_row_dev, max_local_rows * &
314
#if REALCASE == 1
315
                                   size_of_PRECISION_real)
316
317
#endif
#if COMPLEXCASE == 1
318
                                   size_of_PRECISION_complex)
319
#endif
Pavel Kus's avatar
Pavel Kus committed
320
         check_alloc_cuda("tridiag", successCUDA)
321

322
         successCUDA = cuda_malloc(u_row_dev, max_local_rows * &
323
#if REALCASE == 1
324
                                   size_of_PRECISION_real)
325
326
#endif
#if COMPLEXCASE == 1
327
                                   size_of_PRECISION_complex)
328
#endif
Pavel Kus's avatar
Pavel Kus committed
329
         check_alloc_cuda("tridiag", successCUDA)
330

331
         successCUDA = cuda_malloc(v_col_dev, max_local_cols * &
332
#if REALCASE == 1
333
                                   size_of_PRECISION_real)
334
335
#endif
#if COMPLEXCASE == 1
336
                                   size_of_PRECISION_complex)
337
#endif
Pavel Kus's avatar
Pavel Kus committed
338
         check_alloc_cuda("tridiag", successCUDA)
339

340
         successCUDA = cuda_malloc(u_col_dev, max_local_cols * &
341
#if REALCASE == 1
342
                                   size_of_PRECISION_real)
343
344
#endif
#if COMPLEXCASE == 1
345
                                   size_of_PRECISION_complex)
346
#endif
Pavel Kus's avatar
Pavel Kus committed
347
         check_alloc_cuda("tridiag", successCUDA)
348

349
         successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * &
350
#if REALCASE == 1
351
                                   size_of_PRECISION_real)
352
353
#endif
#if COMPLEXCASE == 1
354
                                   size_of_PRECISION_complex)
355
#endif
Pavel Kus's avatar
Pavel Kus committed
356
         check_alloc_cuda("tridiag", successCUDA)
357

358
         successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * &
359
#if REALCASE == 1
360
                                   size_of_PRECISION_real)
361
362
#endif
#if COMPLEXCASE == 1
363
                                   size_of_PRECISION_complex)
364
#endif
Pavel Kus's avatar
Pavel Kus committed
365
         check_alloc_cuda("tridiag", successCUDA)
366
      endif !useGPU
Pavel Kus's avatar
Pavel Kus committed
367

368

Pavel Kus's avatar
Pavel Kus committed
369
370
      d_vec(:) = 0
      e_vec(:) = 0
Pavel Kus's avatar
Pavel Kus committed
371
372
      tau(:) = 0

373
      n_stored_vecs = 0
Pavel Kus's avatar
Pavel Kus committed
374

Pavel Kus's avatar
Pavel Kus committed
375
376
      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat
377
      if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
Pavel Kus's avatar
Pavel Kus committed
378
        d_vec(na) = a_mat(l_rows,l_cols)
379

380
381
      if (useGPU) then
        ! allocate memory for matrix A on the device and then copy the matrix
382
383

        successCUDA = cuda_malloc(a_dev, lda * matrixCols * &
384
#if REALCASE == 1
385
                                  size_of_PRECISION_real)
386
387
#endif
#if COMPLEXCASE == 1
388
                                  size_of_PRECISION_complex)
389
#endif
Pavel Kus's avatar
Pavel Kus committed
390
        check_alloc_cuda("tridiag", successCUDA)
391
392

        successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * &
393
#if REALCASE == 1
394
                                  size_of_PRECISION_real, cudaMemcpyHostToDevice)
395
396
#endif
#if COMPLEXCASE == 1
397
                                  size_of_PRECISION_complex, cudaMemcpyHostToDevice)
398
#endif
Pavel Kus's avatar
Pavel Kus committed
399
        check_alloc_cuda("tridiag", successCUDA)
400
      endif
Pavel Kus's avatar
Pavel Kus committed
401

402
403
      ! main cycle of tridiagonalization
      ! in each step, 1 Householder vector is calculated
404
      do istep = na, 3 ,-1
Pavel Kus's avatar
Pavel Kus committed
405

406
407
408
409
        ! Calculate number of local rows and columns of the still remaining matrix
        ! on the local processor
        l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
        l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)
Pavel Kus's avatar
Pavel Kus committed
410

411
412
        ! Calculate vector for Householder transformation on all procs
        ! owning column istep
Pavel Kus's avatar
Pavel Kus committed
413

414
        if (my_pcol == pcol(istep, nblk, np_cols)) then
Pavel Kus's avatar
Pavel Kus committed
415

416
417
          ! Get vector to be transformed; distribute last element and norm of
          ! remaining elements to all procs in current column
Pavel Kus's avatar
Pavel Kus committed
418

419
420
          ! copy l_cols + 1 column of A to v_row
          if (useGPU) then
421
	    a_offset = l_cols * lda * &
422
#if REALCASE == 1
423
                       size_of_PRECISION_real
424
425
#endif
#if COMPLEXCASE == 1
426
                       size_of_PRECISION_complex
427
#endif
428
            ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
429

430
            successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* &
431
#if REALCASE == 1
432
                                      size_of_PRECISION_real,  &
433
434
#endif
#if COMPLEXCASE == 1
435
                                      size_of_PRECISION_complex,  &
436
#endif
437
438
                                       cudaMemcpyDeviceToHost)

439
            check_memcpy_cuda("tridiag", successCUDA)
440
441
442
          else
            v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
          endif
443

444
445
            if (n_stored_vecs > 0 .and. l_rows > 0) then
	      call timer%start("blas")
446
447
448
449
450
451
#if COMPLEXCASE == 1
              aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
              call PRECISION_GEMV('N',   &
	                          l_rows, 2*n_stored_vecs,                                  &
                                  ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),        &
452
#if REALCASE == 1
453
                                  uv_stored_cols(l_cols+1,1), ubound(uv_stored_cols,dim=1), &
454
455
#endif
#if COMPLEXCASE == 1
456
457
                                   aux, 1,  &

458
#endif
459
                                  ONE, v_row, 1)
460
461
	       call timer%stop("blas")

462
            endif
463

464
            if(my_prow == prow(istep-1, nblk, np_rows)) then
465
466
               aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
               aux1(2) = v_row(l_rows)
Pavel Kus's avatar
Pavel Kus committed
467
            else
468
               aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
Pavel Kus's avatar
Pavel Kus committed
469
470
471
472
473
               aux1(2) = 0.
            endif

#ifdef WITH_MPI
            call timer%start("mpi_communication")
474
            call mpi_allreduce(aux1, aux2, 2, &
475
#if REALCASE == 1
476
	    MPI_REAL_PRECISION,               &
477
478
#endif
#if COMPLEXCASE == 1
479
            MPI_COMPLEX_PRECISION,            &
480
#endif
481
	    MPI_SUM, mpi_comm_rows, mpierr)
Pavel Kus's avatar
Pavel Kus committed
482
483
            call timer%stop("mpi_communication")
#else /* WITH_MPI */
484
          aux2 = aux1
Pavel Kus's avatar
Pavel Kus committed
485
#endif /* WITH_MPI */
486
487
          vnorm2 = aux2(1)
          vrl    = aux2(2)
Pavel Kus's avatar
Pavel Kus committed
488

489
          ! Householder transformation
490
#if REALCASE == 1
491
          call hh_transform_real_&
492
493
#endif
#if COMPLEXCASE == 1
494
          call hh_transform_complex_&
495
#endif
496
497
&PRECISION &
                             (vrl, vnorm2, xf, tau(istep))
498
          ! Scale v_row and store Householder vector for back transformation
Pavel Kus's avatar
Pavel Kus committed
499

500
          v_row(1:l_rows) = v_row(1:l_rows) * xf
501
          if (my_prow == prow(istep-1, nblk, np_rows)) then
502
503
504
505
506
507
508
            v_row(l_rows) = 1.

            ! vrl is newly computed off-diagonal element of the final tridiagonal matrix
            e_vec(istep-1) = vrl
          endif

          ! store Householder vector for back transformation
509
          a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)
510

511
          ! add tau after the end of actuall v_row, to be broadcasted with it
512
          v_row(l_rows+1) = tau(istep)
513
         endif !(my_pcol == pcol(istep, nblk, np_cols))
Pavel Kus's avatar
Pavel Kus committed
514
515

#ifdef WITH_MPI
516
         ! Broadcast the Householder vector (and tau) along columns
517
         call MPI_Bcast(v_row, l_rows+1,       &
518
#if REALCASE == 1
519
                        MPI_REAL_PRECISION,    &
520
521
#endif
#if COMPLEXCASE == 1
522
                        MPI_COMPLEX_PRECISION, &
523
#endif
524
                        pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
Pavel Kus's avatar
Pavel Kus committed
525
526
#endif /* WITH_MPI */

527
528
        !recover tau, which has been broadcasted together with v_row
        tau(istep) =  v_row(l_rows+1)
529

530
        ! Transpose Householder vector v_row -> v_col
531
532
533
534
535
536
537
	call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
                                   (v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
				    1, istep-1, 1, nblk)

538
        ! Calculate u = (A + VU**T + UV**T)*v
Pavel Kus's avatar
Pavel Kus committed
539

540
541
        ! For cache efficiency, we use only the upper half of the matrix tiles for this,
        ! thus the result is partly in u_col(:) and partly in u_row(:)
Pavel Kus's avatar
Pavel Kus committed
542

543
544
        u_col(1:l_cols) = 0
        u_row(1:l_rows) = 0
545
        if (l_rows > 0 .and. l_cols> 0 ) then
546
          if(useGPU) then
547
            successCUDA = cuda_memset(u_col_dev, 0, l_cols * &
548
#if REALCASE == 1
549
                                      size_of_PRECISION_real)
550
551
#endif
#if COMPLEXCASE == 1
552
                                      size_of_PRECISION_complex)
553
554
555
#endif
            check_memcpy_cuda("tridiag", successCUDA)

556
            successCUDA = cuda_memset(u_row_dev, 0, l_rows * &
557
#if REALCASE == 1
558
                                      size_of_PRECISION_real)
559
560
#endif
#if COMPLEXCASE == 1
561
                                      size_of_PRECISION_complex)
562
563
#endif
            check_memcpy_cuda("tridiag", successCUDA)
564

565
            successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * &
566
#if REALCASE == 1
567
                                      size_of_PRECISION_real,            &
568
569
#endif
#if COMPLEXCASE == 1
570
                                      size_of_PRECISION_complex,         &
571
#endif
572
573
                                      cudaMemcpyHostToDevice)

574
575
            check_memcpy_cuda("tridiag", successCUDA)

576
            successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * &
577
#if REALCASE == 1
578
                                      size_of_PRECISION_real,            &
579
580
#endif
#if COMPLEXCASE == 1
581
                                      size_of_PRECISION_complex,         &
582
#endif
583
                                      cudaMemcpyHostToDevice)
584
585
586
587
            check_memcpy_cuda("tridiag", successCUDA)
          endif ! useGU

#if REALCASE == 1
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
!            if (useGPU) then
             !u_col_dev(1:l_cols) = 0.
             !u_row_dev(1:l_rows) = 0.
             !v_col_dev(1:l_cols) = v_col(1:l_cols)
             !v_row_dev(1:l_rows) = v_row(1:l_rows)
!                do i=0,(istep-2)/tile_size
!                  l_col_beg = i*l_cols_per_tile+1
!                  l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
!                  if(l_col_end<l_col_beg) cycle
!                  do j=0,i
!                     l_row_beg = j*l_rows_per_tile+1
!                     l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
!                     if(l_row_end<l_row_beg) cycle
!                     if(mod(n_iter,n_threads) == my_thread) then
!                       call cublasDGEMV('T',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,1.d0,a_dev(l_row_beg,l_col_beg),lda,v_row_dev(l_row_beg),1,1.d0,u_col_dev(l_col_beg),1)
!                       if(i/=j) call cublasDGEMV('N',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,1.d0,a_dev(l_row_beg,l_col_beg),lda,v_col_dev(l_col_beg),1,1.d0,u_row_dev(l_row_beg),1)
!                     endif
!                     n_iter = n_iter+1
!                  enddo
!                enddo
!
             !--- for now, just use DSYMV!!!
610
             ! a_dev -> a_mat ?
Pavel Kus's avatar
Pavel Kus committed
611
             !write(*,*) "ubound ", ubound(a_mat,1), "lda", lda, "lcols", l_cols
612
!              call PRECISION_SYMV('U', l_cols,  &
Pavel Kus's avatar
Pavel Kus committed
613
!                         1.d0, a_mat, ubound(a_mat,1),  &
614
!                         v_row, 1,  &
615
!                         0.d0, u_col, 1)
616
617
             !u_col(1:l_cols) = u_col_dev(1:l_cols)
             !u_row(1:l_rows) = u_row_dev(1:l_rows)
618

619
!            else !do not use GPU
620
621
#endif

Pavel Kus's avatar
Pavel Kus committed
622
#ifdef WITH_OPENMP
623
          call timer%start("OpenMP parallel")
624
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
Pavel Kus's avatar
Pavel Kus committed
625

626
627
          my_thread = omp_get_thread_num()
          n_threads = omp_get_num_threads()
Pavel Kus's avatar
Pavel Kus committed
628

629
          n_iter = 0
Pavel Kus's avatar
Pavel Kus committed
630

631
632
          uc_p(1:l_cols,my_thread) = 0.
          ur_p(1:l_rows,my_thread) = 0.
633
#endif /* WITH_OPENMP */
634
          do i= 0, (istep-2)/tile_size
635
636
            l_col_beg = i*l_cols_per_tile+1
            l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
637
638
            if (l_col_end < l_col_beg) cycle
            do j = 0, i
639
640
              l_row_beg = j*l_rows_per_tile+1
              l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
641
              if (l_row_end < l_row_beg) cycle
Pavel Kus's avatar
Pavel Kus committed
642
#ifdef WITH_OPENMP
643
              if (mod(n_iter,n_threads) == my_thread) then
644
	        call timer%start("blas")
645
#if REALCASE == 1
646
                call PRECISION_GEMV('T',                    &
647
648
#endif
#if COMPLEXCASE == 1
649
650
651
652
653
654
                call PRECISION_GEMV('C',                    &
#endif
		                    l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
                                    ONE, a_mat(l_row_beg,l_col_beg), lda,         &
                                    v_row(l_row_beg), 1, ONE, uc_p(l_col_beg,my_thread), 1)

655
                if (i/=j) then
656
657
658
659
                  call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,          &
                                      ONE, a_mat(l_row_beg,l_col_beg), lda, v_col(l_col_beg), 1,  &

                                      ONE, ur_p(l_row_beg,my_thread), 1)
660
                endif
661
	        call timer%stop("blas")
662
663
              endif
              n_iter = n_iter+1
Pavel Kus's avatar
Pavel Kus committed
664
#else /* WITH_OPENMP */
665

666
              if (useGPU) then
667
                a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
668
#if REALCASE == 1
669
                           size_of_PRECISION_real
670
671
#endif
#if COMPLEXCASE == 1
672
                           size_of_PRECISION_complex
673
#endif
674
	        call timer%start("cublas")
675
#if REALCASE == 1
676
                call cublas_PRECISION_GEMV('T',                        &
677
678
#endif
#if COMPLEXCASE == 1
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
                call cublas_PRECISION_GEMV('C',                        &
#endif
                                           l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,  &
                                           ONE, a_dev + a_offset, lda,                   &
                                           v_row_dev + (l_row_beg - 1) *                 &
#if REALCASE == 1
					   size_of_PRECISION_real, 1,                    &
#endif
#if COMPLEXCASE == 1
                                           size_of_PRECISION_complex, 1,                 &
#endif
                                           ONE, u_col_dev + (l_col_beg - 1) *            &
#if REALCASE == 1
					   size_of_PRECISION_real, 1)
#endif
#if COMPLEXCASE == 1
                                           size_of_PRECISION_complex, 1)
696
#endif
697

698
               if(i/=j) then
699
700
701
702
703
704
705
706
707
708
                  call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1,  &
                                             ONE, a_dev + a_offset, lda,                        &
                                             v_col_dev + (l_col_beg - 1) *                      &
#if REALCASE == 1
					     size_of_PRECISION_real, 1,                         &
#endif
#if COMPLEXCASE == 1
                                             size_of_PRECISION_complex, 1,                      &
#endif
                                             ONE, u_row_dev + (l_row_beg - 1) *                 &
709
#if REALCASE == 1
710
					     size_of_PRECISION_real, 1)
711
712
#endif
#if COMPLEXCASE == 1
713
                                             size_of_PRECISION_complex, 1)
714
#endif
715

716
                endif
717
	        call timer%stop("cublas")
718
              else ! useGPU
719
	        call timer%start("blas")
720
721
#if REALCASE == 1
                call PRECISION_GEMV('T',           &
722
723
#endif
#if COMPLEXCASE == 1
724
                call PRECISION_GEMV('C',           &
725
#endif
726
727
728
729
	                            l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,  &
                                    ONE, a_mat(l_row_beg, l_col_beg), lda,         &
                                    v_row(l_row_beg), 1,                           &
                                    ONE, u_col(l_col_beg), 1)
730
731

                if (i/=j) then
732

733
                  call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1,  &
734
735
736
                                      ONE, a_mat(l_row_beg,l_col_beg), lda,               &
                                      v_col(l_col_beg), 1,                                &
                                      ONE, u_row(l_row_beg), 1)
737
                endif
738
	        call timer%stop("blas")
739
              endif ! useGPU
Pavel Kus's avatar
Pavel Kus committed
740
741

#endif /* WITH_OPENMP */
742
            enddo  ! j=0,i
743
744
          enddo  ! i=0,(istep-2)/tile_size

745
          if (useGPU) then
746
            successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * &
747
#if REALCASE == 1
748
                                      size_of_PRECISION_real,    &
749
750
#endif
#if COMPLEXCASE == 1
751
                                      size_of_PRECISION_complex, &
752
#endif
753
                                      cudaMemcpyDeviceToHost)
754
            check_memcpy_cuda("tridiag", successCUDA)
755
756

            successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * &
757
#if REALCASE == 1
758
                                      size_of_PRECISION_real,    &
759
760
#endif
#if COMPLEXCASE == 1
761
                                      size_of_PRECISION_complex, &
762
#endif
763
                                      cudaMemcpyDeviceToHost)
764
            check_memcpy_cuda("tridiag", successCUDA)
765
          endif
766

767
!              call PRECISION_SYMV('U', l_cols,  &
Pavel Kus's avatar
Pavel Kus committed
768
!                         1.d0, a_mat, ubound(a_mat,1),  &
769
!                         v_row, 1,  &
770
!                         0.d0, u_col, 1)
771
772
773

!            endif ! useGPU

Pavel Kus's avatar
Pavel Kus committed
774
775
#ifdef WITH_OPENMP
!$OMP END PARALLEL
776
          call timer%stop("OpenMP parallel")
Pavel Kus's avatar
Pavel Kus committed
777

778
779
780
781
782
          do i=0,max_threads-1
            u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
            u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
          enddo
#endif /* WITH_OPENMP */
Pavel Kus's avatar
Pavel Kus committed
783

784
          if (n_stored_vecs > 0) then
785
	    call timer%start("blas")
786
#if REALCASE == 1
787
            call PRECISION_GEMV('T',     &
788
789
#endif
#if COMPLEXCASE == 1
790
            call PRECISION_GEMV('C',     &
791
#endif
792
793
794
795
796
797
798
	                        l_rows, 2*n_stored_vecs,   &
                                ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1),   &
                                v_row,  1, ZERO, aux, 1)

            call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs,   &
                                ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1),   &
                                aux, 1, ONE, u_col,  1)
799
	    call timer%stop("blas")
800
          endif
Pavel Kus's avatar
Pavel Kus committed
801

802
        endif  ! (l_rows>0 .and. l_cols>0)
Pavel Kus's avatar
Pavel Kus committed
803

804
        ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
Pavel Kus's avatar
Pavel Kus committed
805
        ! on the processors containing the diagonal
806
        ! This is only necessary if u_row has been calculated, i.e. if the
Pavel Kus's avatar
Pavel Kus committed
807
808
809
        ! global tile size is smaller than the global remaining matrix

        if (tile_size < istep-1) then
810
811
812
813
814
815
816
817

	  call elpa_reduce_add_vectors_&
	  &MATH_DATATYPE&
	  &_&
	  &PRECISION &
          (u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
	   mpi_comm_cols, istep-1, 1, nblk)

Pavel Kus's avatar
Pavel Kus committed
818
819
        endif

820
        ! Sum up all the u_col(:) parts, transpose u_col -> u_row
Pavel Kus's avatar
Pavel Kus committed
821
822

        if (l_cols>0) then
823
          tmp(1:l_cols) = u_col(1:l_cols)
Pavel Kus's avatar
Pavel Kus committed
824
825
#ifdef WITH_MPI
          call timer%start("mpi_communication")
826
          call mpi_allreduce(tmp, u_col, l_cols,    &
827
#if REALCASE == 1
828
                             MPI_REAL_PRECISION,    &
829
830
#endif
#if COMPLEXCASE == 1
831
                             MPI_COMPLEX_PRECISION, &
832
#endif
833
                             MPI_SUM, mpi_comm_rows, mpierr)
Pavel Kus's avatar
Pavel Kus committed
834
835
          call timer%stop("mpi_communication")
#else /* WITH_MPI */
836
          u_col = tmp
Pavel Kus's avatar
Pavel Kus committed
837
838
839
#endif /* WITH_MPI */
        endif

840
841
842
843
844
845
846
        call elpa_transpose_vectors_&
	&MATH_DATATYPE&
	&_&
	&PRECISION &
        (u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
	 mpi_comm_rows, 1, istep-1, 1, nblk)

Pavel Kus's avatar
Pavel Kus committed
847
        ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
848
#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
849
        x = 0
850
        if (l_cols>0)  &
851
           x = dot_product(v_col(1:l_cols),u_col(1:l_cols))
852
853
854
855
856
857
858
#endif
#if COMPLEXCASE == 1
        xc = 0
        if (l_cols>0)  &
           xc = dot_product(v_col(1:l_cols),u_col(1:l_cols))
#endif

Pavel Kus's avatar
Pavel Kus committed
859
860
#ifdef WITH_MPI
        call timer%start("mpi_communication")
861
#if REALCASE == 1
862
        call mpi_allreduce(x, vav, 1, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
863
864
865
866
#endif
#if COMPLEXCASE == 1
        call mpi_allreduce(xc, vav, 1 , MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
Pavel Kus's avatar
Pavel Kus committed
867
868
        call timer%stop("mpi_communication")
#else /* WITH_MPI */
869
870

#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
871
        vav = x
872
873
874
875
876
#endif
#if COMPLEXCASE == 1
        vav = xc
#endif

Pavel Kus's avatar
Pavel Kus committed
877
878
879
880
881
882
#endif /* WITH_MPI */

        ! store u and v in the matrices U and V
        ! these matrices are stored combined in one here

        do j=1,l_rows
883
#if REALCASE == 1
884
885
          vu_stored_rows(j,2*n_stored_vecs+1) = tau(istep)*v_row(j)
          vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*tau(istep)*vav*v_row(j) - u_row(j)
886
887
888
889
890
#endif
#if COMPLEXCASE == 1
          vu_stored_rows(j,2*n_stored_vecs+1) = conjg(tau(istep))*v_row(j)
          vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*conjg(tau(istep))*vav*v_row(j) - u_row(j)
#endif
Pavel Kus's avatar
Pavel Kus committed
891
892
        enddo
        do j=1,l_cols
893
#if REALCASE == 1
894
895
          uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*tau(istep)*vav*v_col(j) - u_col(j)
          uv_stored_cols(j,2*n_stored_vecs+2) = tau(istep)*v_col(j)
896
897
898
899
900
#endif
#if COMPLEXCASE == 1
          uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*conjg(tau(istep))*vav*v_col(j) - u_col(j)
          uv_stored_cols(j,2*n_stored_vecs+2) = conjg(tau(istep))*v_col(j)
#endif
Pavel Kus's avatar
Pavel Kus committed
901
902
        enddo

903
904
905
906
        ! We have calculated another Hauseholder vector, number of implicitly stored increased
        n_stored_vecs = n_stored_vecs+1

        ! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
907
        if (n_stored_vecs == max_stored_uv .or. istep == 3) then
908
909
910

          if (useGPU) then
            successCUDA = cuda_memcpy(vu_stored_rows_dev, loc(vu_stored_rows(1,1)), &
911
912
913
                                      max_local_rows * 2 * max_stored_uv *          &
#if REALCASE == 1
                                      size_of_PRECISION_real,                       &
914
915
#endif
#if COMPLEXCASE == 1
916
917
                                      size_of_PRECISION_complex,                    &

918
#endif
919
                                      cudaMemcpyHostToDevice)
Pavel Kus's avatar
Pavel Kus committed
920
            check_memcpy_cuda("tridiag", successCUDA)
921

922
            successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
923
924
925
                                      max_local_cols * 2 * max_stored_uv *          &
#if REALCASE == 1
                                      size_of_PRECISION_real,                       &
926
927
#endif
#if COMPLEXCASE == 1
928
929
                                      size_of_PRECISION_complex,                    &

930
#endif
931
                                      cudaMemcpyHostToDevice)
Pavel Kus's avatar
Pavel Kus committed
932
            check_memcpy_cuda("tridiag", successCUDA)
933
          endif
Pavel Kus's avatar
Pavel Kus committed
934

935
          do i = 0, (istep-2)/tile_size
936
937
938
939
940
            l_col_beg = i*l_cols_per_tile+1
            l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
            l_row_beg = 1
            l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
            if (l_col_end<l_col_beg .or. l_row_end<l_row_beg) &
941
              cycle
942
943

            if (useGPU) then
944
	      call timer%start("cublas")
945
#if REALCASE == 1
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
              call cublas_PRECISION_GEMM('N', 'T',     &
#endif
#if COMPLEXCASE == 1
              call cublas_PRECISION_GEMM('N', 'C',     &
#endif
	                                 l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs,                      &
                                         ONE, vu_stored_rows_dev + (l_row_beg - 1) *                                         &
#if REALCASE == 1
                                         size_of_PRECISION_real,    &
#endif
#if COMPLEXCASE == 1
                                         size_of_PRECISION_complex, &
#endif

                                         max_local_rows, uv_stored_cols_dev + (l_col_beg - 1) *                              &
#if REALCASE == 1
                                         size_of_PRECISION_real, &
963
964
#endif
#if COMPLEXCASE == 1